Date: (Tue) Apr 19, 2016
Data: Source: Training: https://d37djvu3ytnwxt.cloudfront.net/asset-v1:MITx+15.071x_3+1T2016+type@asset+block/mvtWeek1.csv
New:
Time period:
Based on analysis utilizing <> techniques,
Summary of key steps & error improvement stats:
Use plot.ly for interactive plots ?
varImp for randomForest crashes in caret version:6.0.41 -> submit bug report
extensions toward multiclass classification are scheduled for the next release
rm(list = ls())
set.seed(12345)
options(stringsAsFactors = FALSE)
source("~/Dropbox/datascience/R/mycaret.R")
source("~/Dropbox/datascience/R/mydsutils.R")
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
source("~/Dropbox/datascience/R/mypetrinet.R")
source("~/Dropbox/datascience/R/myplclust.R")
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mytm.R")
# Gather all package requirements here
# Attach every package with library() so that a missing package stops the
# script right here; require() only warns and returns FALSE, which lets the
# script continue and fail later at the first use of the package.
suppressPackageStartupMessages(library(doMC))
glbCores <- 6 # of cores on machine - 2
# Register the doMC parallel backend with glbCores workers.
registerDoMC(glbCores)
suppressPackageStartupMessages(library(caret))
# plyr is attached before dplyr. In the recorded run, dplyr's startup message
# reported masking:
#   from plyr : arrange, count, desc, failwith, id, mutate, rename, summarise,
#               summarize
#   from stats: filter, lag
#   from base : intersect, setdiff, setequal, union
library(plyr)
library(dplyr)
library(knitr)
library(stringr)
#source("dbgcaret.R")
#packageVersion("snow")
#require(sos); findFn("cosine", maxPages=2, sortby="MaxScore")
# Analysis control global variables
# Inputs
# url/name = "<pointer>"; if url specifies a zip file, name = "<filename>"
# sep = choose from c(NULL, "\t")
glbObsTrnFile <- list(url = "https://d37djvu3ytnwxt.cloudfront.net/asset-v1:MITx+15.071x_3+1T2016+type@asset+block/mvtWeek1.csv")
#, splitSpecs = list(method = NULL #select from c(NULL, "condition", "sample", "copy")
# ,nRatio = 0.3 # > 0 && < 1 if method == "sample"
# ,seed = 123 # any integer or glbObsTrnPartitionSeed if method == "sample"
# ,condition = # or 'is.na(<var>)'; '<var> <condition_operator> <value>'
# )
# )
glbObsNewFile <- NULL # default OR list(url = "<obsNewFileName>")
glbInpMerge <- NULL #: default
# list(fnames = c("<fname1>", "<fname2>")) # files will be concatenated
glbObsDropCondition <- #NULL # : default
# enclose in single-quotes b/c condition might include double quotes
# use | & ; NOT || &&
# '<condition>'
# 'grepl("^First Draft Video:", glbObsAll$Headline)'
# 'is.na(glbObsAll[, glb_rsp_var_raw])'
# '(is.na(glbObsAll[, glb_rsp_var_raw]) & grepl("Train", glbObsAll[, glbFeatsId]))'
'is.na(strptime(glbObsAll[, "Date"], glbFeatsDateTime[["Date"]]["format"], tz = glbFeatsDateTime[["Date"]]["timezone"]))'
#nrow(do.call("subset",list(glbObsAll, parse(text=paste0("!(", glbObsDropCondition, ")")))))
glb_obs_repartition_train_condition <- NULL # : default
# "<condition>"
glb_max_fitobs <- NULL # or any integer
glbObsTrnPartitionSeed <- 123 # or any integer
glb_is_regression <- FALSE; glb_is_classification <- !glb_is_regression;
glb_is_binomial <- TRUE # or TRUE or FALSE
glb_rsp_var_raw <- "Arrest"
# for classification, the response variable has to be a factor
glb_rsp_var <- glb_rsp_var_raw # or "Arrest.fctr"
# if the response factor is based on numbers/logicals e.g (0/1 OR TRUE/FALSE vs. "A"/"B"),
# or contains spaces (e.g. "Not in Labor Force")
# caret predict(..., type="prob") crashes
# Map the raw response to the modelling response.
#
# Args:
#   raw: vector of raw response values; elements equal to TRUE become "T",
#        every other non-missing element becomes "F".
# Returns:
#   A factor with levels c("F", "T") ("F" is the reference level), the same
#   length as `raw`; NA inputs stay NA.
#
# Alternative mappings kept as templates for other response types:
#   return(raw ^ 0.5)
#   return(log(raw))
#   return(log(1 + raw))
#   return(log10(raw))
#   return(exp(-raw / 2))
#   as.factor(paste0("B", raw))
#   as.factor(gsub(" ", "\\.", raw))
glb_map_rsp_raw_to_var <- # or NULL
  function(raw) {
    ret_vals <- rep_len(NA_character_, length(raw))
    is_obs <- !is.na(raw)
    ret_vals[is_obs] <- ifelse(raw[is_obs] == TRUE, "T", "F")
    # Declare both levels explicitly instead of relevel(as.factor(.), ref = "F"):
    # relevel() stops with "'ref' must be an existing level" when the input
    # holds no FALSE (or only NAs). Fixed levels also keep the level codes
    # stable (F = 1, T = 2) whatever values happen to be present.
    return(factor(ret_vals, levels = c("F", "T")))
  }
#if glb_rsp_var_raw is numeric:
#print(summary(glbObsAll[, glb_rsp_var_raw]))
#glb_map_rsp_raw_to_var(tst <- c(NA, as.numeric(summary(glbObsAll[, glb_rsp_var_raw]))))
#if glb_rsp_var_raw is character:
#print(table(glbObsAll[, glb_rsp_var_raw], useNA = "ifany"))
#print(table(glb_map_rsp_raw_to_var(tst <- glbObsAll[, glb_rsp_var_raw]), useNA = "ifany"))
# Inverse of the response mapping: turn the modelling response back into the
# raw logical response.
#
# Args:
#   var: response factor; its first level code maps to FALSE, its second to
#        TRUE.
# Returns:
#   A logical vector the same length as `var`; NA stays NA.
#
# Alternative inverse mappings kept as templates for other response types:
#   return(var ^ 2.0)
#   return(exp(var))
#   return(10 ^ var)
#   return(-log(var) * 2)
#   as.numeric(var)
#   levels(var)[as.numeric(var)]
#   gsub("\\.", " ", levels(var)[as.numeric(var)])
#   c("<=50K", " >50K")[as.numeric(var)]
glb_map_rsp_var_to_raw <- # or NULL
  function(var) {
    raw_by_level <- c(FALSE, TRUE)
    level_code <- as.numeric(var)
    raw_by_level[level_code]
  }
#print(table(glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(tst)), useNA = "ifany"))
if ((glb_rsp_var != glb_rsp_var_raw) && is.null(glb_map_rsp_raw_to_var))
stop("glb_map_rsp_raw_to_var function expected")
# List info gathered for various columns
# <col_name>: <description>; <notes>
# ID: a unique identifier for each observation
# Date: the date the crime occurred
# LocationDescription: the location where the crime occurred
# Arrest: whether or not an arrest was made for the crime (TRUE if an arrest was made, and FALSE if an arrest was not made)
# Domestic: whether or not the crime was a domestic crime, meaning that it was committed against a family member (TRUE if it was domestic, and FALSE if it was not domestic)
# Beat: the area, or "beat" in which the crime occurred. This is the smallest regional division defined by the Chicago police department.
# District: the police district in which the crime occurred. Each district is composed of many beats, and the districts are defined by the Chicago Police Department.
# CommunityArea: the community area in which the crime occurred. Since the 1920s, Chicago has been divided into what are called "community areas", of which there are now 77. The community areas were devised in an attempt to create socially homogeneous regions.
# Year: the year in which the crime occurred.
# Latitude: the latitude of the location at which the crime occurred.
# Longitude: the longitude of the location at which the crime occurred.
# currently does not handle more than 1 column; consider concatenating multiple columns
# If glbFeatsId == NULL, ".rownames <- as.numeric(row.names())" is the default
glbFeatsId <- "ID" # choose from c(NULL : default, "<id_feat>")
glbFeatsCategory <- "LocationDescription.my" # choose from c(NULL : default, "<category_feat>")
# User-specified exclusions
glbFeatsExclude <- c(NULL
# Feats that shd be excluded due to known causation by prediction variable
# , "<feat1", "<feat2>"
# Feats that are factors with unique values (as % of nObs) > 49 (empirically derived)
# Feats that are linear combinations (alias in glm)
# Feature-engineering phase -> start by excluding all features except id & category & work each one in
, "LocationDescription", "District", "CommunityArea", "Latitude", "Longitude"
)
if (glb_rsp_var_raw != glb_rsp_var)
glbFeatsExclude <- union(glbFeatsExclude, glb_rsp_var_raw)
glbFeatsInteractionOnly <- list()
#glbFeatsInteractionOnly[["<child_feat>"]] <- "<parent_feat>"
glbFeatsDrop <- c(NULL
# , "<feat1>", "<feat2>"
)
glb_map_vars <- NULL # or c("<var1>", "<var2>")
glb_map_urls <- list();
# glb_map_urls[["<var1>"]] <- "<var1.url>"
glb_assign_pairs_lst <- NULL;
# glb_assign_pairs_lst[["<var1>"]] <- list(from=c(NA),
# to=c("NA.my"))
glb_assign_vars <- names(glb_assign_pairs_lst)
# Derived features; Use this mechanism to cleanse data ??? Cons: Data duplication ???
glbFeatsDerive <- list();
# glbFeatsDerive[["<feat.my.sfx>"]] <- list(
# mapfn = function(<arg1>, <arg2>) { return(function(<arg1>, <arg2>)) }
# , args = c("<arg1>", "<arg2>"))
#myprint_df(data.frame(ImageId = mapfn(glbObsAll$.src, glbObsAll$.pos)))
#data.frame(ImageId = mapfn(glbObsAll$.src, glbObsAll$.pos))[7045:7055, ]
# character
# mapfn = function(Week) { return(substr(Week, 1, 10)) }
# mapfn = function(Name) { return(sapply(Name, function(thsName)
# str_sub(unlist(str_split(thsName, ","))[1], 1, 1))) }
# Derived feature "LocationDescription.my": recodes the raw LocationDescription
# values into a smaller set of group labels ("Other", "Entertainment", "cha",
# "School", "Government", "Sidewalk", "CommercialVehicle", "Residence") with
# plyr::revalue.
# NOTE(review): raw values that are not listed below are presumably left
# unchanged by revalue -- confirm against the plyr documentation.
# NOTE(review): "cha" is the only lower-case group label -- confirm intended.
glbFeatsDerive[["LocationDescription.my"]] <- list(
mapfn = function(LocationDescription) { return(plyr::revalue(LocationDescription, c(
# ---- mapped to "Other" ----
# nObs <= 10
"ABANDONED BUILDING" = "Other",
"BRIDGE" = "Other",
"CHURCH/SYNAGOGUE/PLACE OF WORSHIP" = "Other",
"HIGHWAY/EXPRESSWAY" = "Other",
"OTHER" = "Other",
# Could be "Hospital" instead of "Other"
"ANIMAL HOSPITAL" = "Other",
"HOSPITAL BUILDING/GROUNDS" = "Other",
"MEDICAL/DENTAL OFFICE" = "Other",
"NURSING HOME/RETIREMENT HOME" = "Other",
# Could be "Office" instead of "Other"
"CURRENCY EXCHANGE" = "Other",
"BANK" = "Other",
"SAVINGS AND LOAN" = "Other",
"COMMERCIAL / BUSINESS OFFICE" = "Other",
"FACTORY/MANUFACTURING BUILDING" = "Other",
"WAREHOUSE" = "Other",
"CONSTRUCTION SITE" = "Other",
# Could be "Store" instead of "Other"
"APPLIANCE STORE" = "Other",
"BARBERSHOP" = "Other",
"CAR WASH" = "Other",
"CLEANING STORE" = "Other",
"CONVENIENCE STORE" = "Other",
"DEPARTMENT STORE" = "Other",
"DRUG STORE" = "Other",
"GROCERY FOOD STORE" = "Other",
"SMALL RETAIL STORE" = "Other",
"TAVERN/LIQUOR STORE" = "Other",
# ---- mapped to "Entertainment" ----
"BOWLING ALLEY" = "Entertainment",
"ATHLETIC CLUB" = "Entertainment",
"BAR OR TAVERN" = "Entertainment",
"HOTEL/MOTEL" = "Entertainment",
"MOVIE HOUSE/THEATER" = "Entertainment",
"RESTAURANT" = "Entertainment",
"SPORTS ARENA/STADIUM" = "Entertainment",
"LAKEFRONT/WATERFRONT/RIVERBANK" = "Entertainment",
"FOREST PRESERVE" = "Entertainment",
"PARK PROPERTY" = "Entertainment",
# ---- mapped to "cha" ----
"CHA APARTMENT" = "cha",
"CHA PARKING LOT/GROUNDS" = "cha",
# ---- mapped to "School" ----
"DAY CARE CENTER" = "School",
"SCHOOL, PRIVATE, BUILDING" = "School",
"SCHOOL, PRIVATE, GROUNDS" = "School",
"SCHOOL, PUBLIC, BUILDING" = "School",
"SCHOOL, PUBLIC, GROUNDS" = "School",
"COLLEGE/UNIVERSITY RESIDENCE HALL" = "School",
"LIBRARY" = "School",
"COLLEGE/UNIVERSITY GROUNDS" = "School",
# ---- mapped to "Government" ----
"FIRE STATION" = "Government",
"GOVERNMENT BUILDING/PROPERTY" = "Government",
"POLICE FACILITY/VEH PARKING LOT" = "Government",
"JAIL / LOCK-UP FACILITY" = "Government",
# ---- mapped to "Sidewalk" ----
"NEWSSTAND" = "Sidewalk",
"SIDEWALK" = "Sidewalk",
# ---- mapped to "CommercialVehicle" ----
"AIRPORT EXTERIOR - SECURE AREA" = "CommercialVehicle",
"AIRPORT/AIRCRAFT" = "CommercialVehicle",
"AIRPORT EXTERIOR - NON-SECURE AREA" = "CommercialVehicle",
"AIRPORT PARKING LOT" = "CommercialVehicle",
"AIRPORT VENDING ESTABLISHMENT" = "CommercialVehicle",
"AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA" = "CommercialVehicle",
"AIRPORT BUILDING NON-TERMINAL - SECURE AREA" = "CommercialVehicle",
"AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA" = "CommercialVehicle",
"OTHER COMMERCIAL TRANSPORTATION" = "CommercialVehicle",
"TAXICAB" = "CommercialVehicle",
"VEHICLE-COMMERCIAL" = "CommercialVehicle",
"OTHER RAILROAD PROP / TRAIN DEPOT" = "CommercialVehicle",
# Could be "cta" instead of "CommercialVehicle"
"CTA GARAGE / OTHER PROPERTY" = "CommercialVehicle",
"CTA TRAIN" = "CommercialVehicle",
# ---- mapped to "Residence" ----
"RESIDENCE PORCH/HALLWAY" = "Residence",
"APARTMENT" = "Residence",
"RESIDENCE-GARAGE" = "Residence",
"RESIDENCE" = "Residence",
"RESIDENTIAL YARD (FRONT/BACK)" = "Residence",
"DRIVEWAY - RESIDENTIAL" = "Residence",
# Self-mapping entry; presumably a terminator so that every real entry above
# can end with a comma -- TODO confirm.
"**" = "**"
))) }
# Name of the observation column passed to mapfn.
, args = c("LocationDescription"))
# mapfn = function(descriptor) { return(plyr::revalue(descriptor, c(
# "ABANDONED BUILDING" = "OTHER",
# "**" = "**"
# ))) }
# mapfn = function(description) { mod_raw <- description;
# This is here because it does not work if it's in txt_map_filename
# mod_raw <- gsub(paste0(c("\n", "\211", "\235", "\317", "\333"), collapse = "|"), " ", mod_raw)
# Don't parse for "." because of ".com"; use customized gsub for that text
# mod_raw <- gsub("(\\w)(!|\\*|,|-|/)(\\w)", "\\1\\2 \\3", mod_raw);
# Some state acronyms need context for separation e.g.
# LA/L.A. could either be "Louisiana" or "LosAngeles"
# modRaw <- gsub("\\bL\\.A\\.( |,|')", "LosAngeles\\1", modRaw);
# OK/O.K. could either be "Oklahoma" or "Okay"
# modRaw <- gsub("\\bACA OK\\b", "ACA OKay", modRaw);
# modRaw <- gsub("\\bNow O\\.K\\.\\b", "Now OKay", modRaw);
# PR/P.R. could either be "PuertoRico" or "Public Relations"
# modRaw <- gsub("\\bP\\.R\\. Campaign", "PublicRelations Campaign", modRaw);
# VA/V.A. could either be "Virginia" or "VeteransAdministration"
# modRaw <- gsub("\\bthe V\\.A\\.\\:", "the VeteranAffairs:", modRaw);
#
# Custom mods
# return(mod_raw) }
# numeric
# Create feature based on record position/id in data
# Positional features ".pos" and ".pos.y": the 1-based position of each
# observation, derived from the length of the .rnorm column.
# seq_along() is used instead of 1:length(): for an empty input 1:length()
# yields c(1, 0) rather than an empty vector.
glbFeatsDerive[[".pos"]] <- list(
  mapfn = function(.rnorm) { return(seq_along(.rnorm)) }
  , args = c(".rnorm"))
glbFeatsDerive[[".pos.y"]] <- list(
  mapfn = function(.rnorm) { return(seq_along(.rnorm)) }
  , args = c(".rnorm"))
# Add logs of numerics that are not distributed normally
# Derive & keep multiple transformations of the same feature, if normality is hard to achieve with just one transformation
# Right skew: logp1; sqrt; ^ 1/3; logp1(logp1); log10; exp(-<feat>/constant)
# glbFeatsDerive[["WordCount.log1p"]] <- list(
# mapfn = function(WordCount) { return(log1p(WordCount)) }
# , args = c("WordCount"))
# glbFeatsDerive[["WordCount.root2"]] <- list(
# mapfn = function(WordCount) { return(WordCount ^ (1/2)) }
# , args = c("WordCount"))
# glbFeatsDerive[["WordCount.nexp"]] <- list(
# mapfn = function(WordCount) { return(exp(-WordCount)) }
# , args = c("WordCount"))
#print(summary(glbObsAll$WordCount))
#print(summary(mapfn(glbObsAll$WordCount)))
# If imputation shd be skipped for this feature
# Derived feature "District.fctr": bins District into "1-9" (< 10),
# "10-19" (< 20) and "20+"; missing values get the literal string level "NA",
# which becomes the reference (first) level whenever it occurs.
glbFeatsDerive[["District.fctr"]] <- list(
  mapfn = function(District) {
    ret_vals <- rep_len("NA", length(District))
    is_obs <- !is.na(District)
    # Vectorised binning. The earlier per-element sapply() returned list()
    # when no value was observed, which turned ret_vals into a list and broke
    # the factor conversion.
    ret_vals[is_obs] <- ifelse(District[is_obs] < 10, "1-9",
                               ifelse(District[is_obs] < 20, "10-19", "20+"))
    ret_fctr <- as.factor(ret_vals)
    # relevel() stops with "'ref' must be an existing level" when District has
    # no missing values, so only move "NA" to the front when it is present.
    if ("NA" %in% levels(ret_fctr)) {
      ret_fctr <- relevel(ret_fctr, ref = "NA")
    }
    return(ret_fctr)
  }
  , args = c("District"))
# If imputation of missing data is not working ...
# glbFeatsDerive[["FertilityRate.nonNA"]] <- list(
# mapfn = function(FertilityRate, Region) {
# RegionMdn <- tapply(FertilityRate, Region, FUN = median, na.rm = TRUE)
#
# retVal <- FertilityRate
# retVal[is.na(FertilityRate)] <- RegionMdn[Region[is.na(FertilityRate)]]
# return(retVal)
# }
# , args = c("FertilityRate", "Region"))
# mapfn = function(HOSPI.COST) { return(cut(HOSPI.COST, 5, breaks = c(0, 100000, 200000, 300000, 900000), labels = NULL)) }
# mapfn = function(Rasmussen) { return(ifelse(sign(Rasmussen) >= 0, 1, 0)) }
# mapfn = function(startprice) { return(startprice ^ (1/2)) }
# mapfn = function(startprice) { return(log(startprice)) }
# mapfn = function(startprice) { return(exp(-startprice / 20)) }
# mapfn = function(startprice) { return(scale(log(startprice))) }
# mapfn = function(startprice) { return(sign(sprice.predict.diff) * (abs(sprice.predict.diff) ^ (1/10))) }
# factor
# mapfn = function(PropR) { return(as.factor(ifelse(PropR >= 0.5, "Y", "N"))) }
# mapfn = function(productline, description) { as.factor(gsub(" ", "", productline)) }
# mapfn = function(purpose) { return(relevel(as.factor(purpose), ref="all_other")) }
# mapfn = function(raw) { tfr_raw <- as.character(cut(raw, 5));
# tfr_raw[is.na(tfr_raw)] <- "NA.my";
# return(as.factor(tfr_raw)) }
# mapfn = function(startprice.log10) { return(cut(startprice.log10, 3)) }
# mapfn = function(startprice.log10) { return(cut(sprice.predict.diff, c(-1000, -100, -10, -1, 0, 1, 10, 100, 1000))) }
# , args = c("<arg1>"))
# multiple args
# mapfn = function(id, date) { return(paste(as.character(id), as.character(date), sep = "#")) }
# mapfn = function(PTS, oppPTS) { return(PTS - oppPTS) }
# mapfn = function(startprice.log10.predict, startprice) {
# return(spdiff <- (10 ^ startprice.log10.predict) - startprice) }
# mapfn = function(productline, description) { as.factor(
# paste(gsub(" ", "", productline), as.numeric(nchar(description) > 0), sep = "*")) }
# mapfn = function(.src, .pos) {
# return(paste(.src, sprintf("%04d",
# ifelse(.src == "Train", .pos, .pos - 7049)
# ), sep = "#")) }
# # If glbObsAll is not sorted in the desired manner
# mapfn=function(Week) { return(coredata(lag(zoo(orderBy(~Week, glbObsAll)$ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI) { return(coredata(lag(zoo(ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI.2.lag) { return(log(ILI.2.lag)) }
# glbFeatsDerive[["<var1>"]] <- glbFeatsDerive[["<var2>"]]
glb_derive_vars <- names(glbFeatsDerive)
# tst <- "descr.my"; args_lst <- NULL; for (arg in glbFeatsDerive[[tst]]$args) args_lst[[arg]] <- glbObsAll[, arg]; print(head(args_lst[[arg]])); print(head(drv_vals <- do.call(glbFeatsDerive[[tst]]$mapfn, args_lst)));
# print(which_ix <- which(args_lst[[arg]] == 0.75)); print(drv_vals[which_ix]);
# Date-time feature specs, keyed by feature name.
# Each spec is built with c(), so it is a named *character* vector: the
# logical flags are stored as the strings "FALSE" / "TRUE".
glbFeatsDateTime <- list(
  Date = c(
    format = "%m/%d/%y %H:%M",
    timezone = "US/Central",
    impute.na = FALSE,
    last.ctg = FALSE,
    poly.ctg = FALSE
  )
)
# Use OlsonNames() to enumerate supported time zones
# glbFeatsDateTime[["<DateTimeFeat>"]] <-
# c(format = "%Y-%m-%d %H:%M:%S", timezone = "America/New_York", impute.na = TRUE,
# last.ctg = FALSE, poly.ctg = FALSE)
glbFeatsPrice <- NULL # or c("<price_var>")
glbFeatsImage <- list() #list(<imageFeat> = list(patchSize = 10)) # if patchSize not specified, no patch computation
glbFeatsText <- list()
Sys.setlocale("LC_ALL", "C") # For english
## [1] "C/C/C/C/C/en_US.UTF-8"
#glbFeatsText[["<TextFeature>"]] <- list(NULL,
# ,names = myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL,
# <comma-separated-screened-names>
# ))))
# ,rareWords = myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL,
# <comma-separated-nonSCOWL-words>
# ))))
#)
# Text Processing Step: custom modifications not present in txt_munge -> use glbFeatsDerive
# Text Processing Step: universal modifications
glb_txt_munge_filenames_pfx <- "<projectId>_mytxt_"
# Text Processing Step: tolower
# Text Processing Step: myreplacePunctuation
# Text Processing Step: removeWords
glb_txt_stop_words <- list()
# Remember to use unstemmed words
# Build the custom stop-word list, only when text features are configured.
# The words are stripped of spaces, lower-cased, passed through
# myreplacePunctuation and sorted.
if (length(glbFeatsText) > 0) {
  # library() instead of require(): a missing package must stop the script
  # here rather than produce a warning and a later failure.
  library(tm)
  library(stringr)
  glb_txt_stop_words[["<txt_var>"]] <- sort(myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL
    # Remove any words from stopwords
    # , setdiff(myreplacePunctuation(stopwords("english")), c("<keep_wrd1>", "<keep_wrd2>"))
    # Remove salutations
    ,"mr","mrs","dr","Rev"
    # Remove misc
    #,"th" # Happy [[:digit:]]+th birthday
    # Remove terms present in Trn only or New only; search for "Partition post-stem"
    # ,<comma-separated-terms>
    # cor.y.train == NA
    # ,unlist(strsplit(paste(c(NULL
    # ,"<comma-separated-terms>"
    # ), collapse=",")
    # freq == 1; keep c("<comma-separated-terms-to-keep>")
    # ,<comma-separated-terms>
    # chisq.pval high (e.g. == 1); keep c("<comma-separated-terms-to-keep>")
    # ,<comma-separated-terms>
    # nzv.freqRatio high (e.g. >= glbFeatsNzvFreqMax); keep c("<comma-separated-terms-to-keep>")
    # ,<comma-separated-terms>
  )))))
}
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^man", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txtFeat]][, 4866] > 0, c(glb_rsp_var, txtFeat)]
# To identify terms with a specific freq
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txtFeat]], freq == 1)$term), collapse = ",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], freq <= 2)$term), collapse = ",")
#subset(glb_post_stem_words_terms_df_lst[[txtFeat]], term %in% c("zinger"))
# To identify terms with a specific freq &
# are not stemmed together later OR is value of color.fctr (e.g. gold)
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txtFeat]], (freq == 1) & !(term %in% c("blacked","blemish","blocked","blocks","buying","cables","careful","carefully","changed","changing","chargers","cleanly","cleared","connect","connects","connected","contains","cosmetics","default","defaulting","defective","definitely","describe","described","devices","displays","drop","drops","engravement","excellant","excellently","feels","fix","flawlessly","frame","framing","gentle","gold","guarantee","guarantees","handled","handling","having","install","iphone","iphones","keeped","keeps","known","lights","line","lining","liquid","liquidation","looking","lots","manuals","manufacture","minis","most","mostly","network","networks","noted","opening","operated","performance","performs","person","personalized","photograph","physically","placed","places","powering","pre","previously","products","protection","purchasing","returned","rotate","rotation","running","sales","second","seconds","shipped","shuts","sides","skin","skinned","sticker","storing","thats","theres","touching","unusable","update","updates","upgrade","weeks","wrapped","verified","verify") ))$term), collapse = ",")
#print(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (freq <= 2)))
#glbObsAll[which(terms_mtrx[, 229] > 0), glbFeatsText]
# To identify terms with cor.y == NA
#orderBy(~-freq+term, subset(glb_post_stop_words_terms_df_lst[[txtFeat]], is.na(cor.y)))
#paste(sort(subset(glb_post_stop_words_terms_df_lst[[txtFeat]], is.na(cor.y))[, "term"]), collapse=",")
#orderBy(~-freq+term, subset(glb_post_stem_words_terms_df_lst[[txtFeat]], is.na(cor.y)))
# To identify terms with low cor.y.abs
#head(orderBy(~cor.y.abs+freq+term, subset(glb_post_stem_words_terms_df_lst[[txtFeat]], !is.na(cor.y))), 5)
# To identify terms with high chisq.pval
#subset(glb_post_stem_words_terms_df_lst[[txtFeat]], chisq.pval > 0.99)
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (chisq.pval > 0.99) & (freq <= 10))$term), collapse=",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (chisq.pval > 0.9))$term), collapse=",")
#head(orderBy(~-chisq.pval+freq+term, glb_post_stem_words_terms_df_lst[[txtFeat]]), 5)
#glbObsAll[glb_post_stem_words_terms_mtrx_lst[[txtFeat]][, 68] > 0, glbFeatsText]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^m", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
# To identify terms with high nzv.freqRatio
#summary(glb_post_stem_words_terms_df_lst[[txtFeat]]$nzv.freqRatio)
#paste0(sort(setdiff(subset(glb_post_stem_words_terms_df_lst[[txtFeat]], (nzv.freqRatio >= glbFeatsNzvFreqMax) & (freq < 10) & (chisq.pval >= 0.05))$term, c( "128gb","3g","4g","gold","ipad1","ipad3","ipad4","ipadair2","ipadmini2","manufactur","spacegray","sprint","tmobil","verizon","wifion"))), collapse=",")
# To identify obs with a txt term
#tail(orderBy(~-freq+term, glb_post_stop_words_terms_df_lst[[txtFeat]]), 20)
#mydspObs(list(descr.my.contains="non"), cols=c("color", "carrier", "cellular", "storage"))
#grep("ever", dimnames(terms_stop_mtrx)$Terms)
#which(terms_stop_mtrx[, grep("ipad", dimnames(terms_stop_mtrx)$Terms)] > 0)
#glbObsAll[which(terms_stop_mtrx[, grep("16", dimnames(terms_stop_mtrx)$Terms)[1]] > 0), c(glbFeatsCategory, "storage", txtFeat)]
# Text Processing Step: screen for names # Move to glbFeatsText specs section in order of text processing steps
# glbFeatsText[["<txtFeat>"]]$names <- myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL
# # Person names for names screening
# ,<comma-separated-list>
#
# # Company names
# ,<comma-separated-list>
#
# # Product names
# ,<comma-separated-list>
# ))))
# glbFeatsText[["<txtFeat>"]]$rareWords <- myreplacePunctuation(str_to_lower(gsub(" ", "", c(NULL
# # Words not in SCOWL db
# ,<comma-separated-list>
# ))))
# To identify char vectors post glbFeatsTextMap
#grep("six(.*)hour", glb_txt_chr_lst[[txtFeat]], ignore.case = TRUE, value = TRUE)
#grep("[S|s]ix(.*)[H|h]our", glb_txt_chr_lst[[txtFeat]], value = TRUE)
# To identify whether terms shd be synonyms
#orderBy(~term, glb_post_stop_words_terms_df_lst[[txtFeat]][grep("^moder", glb_post_stop_words_terms_df_lst[[txtFeat]]$term), ])
# term_row_df <- glb_post_stop_words_terms_df_lst[[txtFeat]][grep("^came$", glb_post_stop_words_terms_df_lst[[txtFeat]]$term), ]
#
# cor(glb_post_stop_words_terms_mtrx_lst[[txtFeat]][glbObsAll$.lcn == "Fit", term_row_df$pos], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
# To identify which stopped words are "close" to a txt term
#sort(cluster_vars)
# Text Processing Step: stemDocument
# To identify stemmed txt terms
#glb_post_stop_words_terms_df_lst[[txtFeat]][grep("^la$", glb_post_stop_words_terms_df_lst[[txtFeat]]$term), ]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^con", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
#glbObsAll[which(terms_stem_mtrx[, grep("use", dimnames(terms_stem_mtrx)$Terms)[[1]]] > 0), c(glbFeatsId, "productline", txtFeat)]
#glbObsAll[which(TfIdf_stem_mtrx[, 191] > 0), c(glbFeatsId, glbFeatsCategory, txtFeat)]
#glbObsAll[which(glb_post_stop_words_terms_mtrx_lst[[txtFeat]][, 6165] > 0), c(glbFeatsId, glbFeatsCategory, txtFeat)]
#which(glbObsAll$UniqueID %in% c(11915, 11926, 12198))
# Text Processing Step: mycombineSynonyms
# To identify which terms are associated with not -> combine "could not" & "couldn't"
#findAssocs(glb_full_DTM_lst[[txtFeat]], "not", 0.05)
# To identify which synonyms should be combined
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txtFeat]][grep("^c", glb_post_stem_words_terms_df_lst[[txtFeat]]$term), ])
# Interactive check for a candidate synonym group.
# Prints (1) the post-stem term rows whose term is in syn_lst$syns, then
# (2) the row for syn_lst$word in the corpus terms obtained after applying
# mycombineSynonyms with this synonym entry.
#
# Args:
#   syn_lst: list(word = <target term>, syns = <terms to combine>)
# Reads the globals txtFeat, glb_post_stem_words_terms_df_lst and
# glbFeatsTextCorpus.
chk_comb_cor <- function(syn_lst) {
  # cor(terms_stem_mtrx[glbObsAll$.src == "Train", grep("^(damag|dent|ding)$", dimnames(terms_stem_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
  stem_terms_df <- glb_post_stem_words_terms_df_lst[[txtFeat]]
  print(subset(stem_terms_df, term %in% syn_lst$syns))
  combined_corpus <- tm_map(glbFeatsTextCorpus[[txtFeat]], mycombineSynonyms,
                            list(syn_lst), lazy=FALSE)
  combined_terms_df <- get_corpus_terms(combined_corpus)
  print(subset(combined_terms_df, term == syn_lst$word))
  # cor(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])], glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
  # cor(rowSums(terms_stop_mtrx[glbObsAll$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])]), glbObsTrn[, glb_rsp_var], use="pairwise.complete.obs")
}
#chk_comb_cor(syn_lst=list(word="cabl", syns=c("cabl", "cord")))
#chk_comb_cor(syn_lst=list(word="damag", syns=c("damag", "dent", "ding")))
#chk_comb_cor(syn_lst=list(word="dent", syns=c("dent", "ding")))
#chk_comb_cor(syn_lst=list(word="use", syns=c("use", "usag")))
glbFeatsTextSynonyms <- list()
# list parsed to collect glbFeatsText[[<txtFeat>]]$vldTerms
# glbFeatsTextSynonyms[["Hdln.my"]] <- list(NULL
# # people in places
# , list(word = "australia", syns = c("australia", "australian"))
# , list(word = "italy", syns = c("italy", "Italian"))
# , list(word = "newyork", syns = c("newyork", "newyorker"))
# , list(word = "Pakistan", syns = c("Pakistan", "Pakistani"))
# , list(word = "peru", syns = c("peru", "peruvian"))
# , list(word = "qatar", syns = c("qatar", "qatari"))
# , list(word = "scotland", syns = c("scotland", "scotish"))
# , list(word = "Shanghai", syns = c("Shanghai", "Shanzhai"))
# , list(word = "venezuela", syns = c("venezuela", "venezuelan"))
#
# # companies - needs to be data dependent
# # - e.g. ensure BNP in this experiment/feat always refers to BNPParibas
#
# # general synonyms
# , list(word = "Create", syns = c("Create","Creator"))
# , list(word = "cute", syns = c("cute","cutest"))
# , list(word = "Disappear", syns = c("Disappear","Fadeout"))
# , list(word = "teach", syns = c("teach", "taught"))
# , list(word = "theater", syns = c("theater", "theatre", "theatres"))
# , list(word = "understand", syns = c("understand", "understood"))
# , list(word = "weak", syns = c("weak", "weaken", "weaker", "weakest"))
# , list(word = "wealth", syns = c("wealth", "wealthi"))
#
# # custom synonyms (phrases)
#
# # custom synonyms (names)
# )
#glbFeatsTextSynonyms[["<txtFeat>"]] <- list(NULL
# , list(word="<stem1>", syns=c("<stem1>", "<stem1_2>"))
# )
# Lower-case the `word` and `syns` fields of every synonym entry, for every
# text feature that has synonyms configured.
# seq_along() replaces 1:length(): for a feature with an empty entry list,
# 1:length() would iterate over c(1, 0) instead of skipping the loop.
for (txtFeat in names(glbFeatsTextSynonyms)) {
  for (entryIx in seq_along(glbFeatsTextSynonyms[[txtFeat]])) {
    glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$word <-
      str_to_lower(glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$word)
    glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$syns <-
      str_to_lower(glbFeatsTextSynonyms[[txtFeat]][[entryIx]]$syns)
  }
}
glbFeatsTextSeed <- 181
# tm options include: check tm::weightSMART
glb_txt_terms_control <- list( # Gather model performance & run-time stats
# weighting = function(x) weightSMART(x, spec = "nnn")
# weighting = function(x) weightSMART(x, spec = "lnn")
# weighting = function(x) weightSMART(x, spec = "ann")
# weighting = function(x) weightSMART(x, spec = "bnn")
# weighting = function(x) weightSMART(x, spec = "Lnn")
#
weighting = function(x) weightSMART(x, spec = "ltn") # default
# weighting = function(x) weightSMART(x, spec = "lpn")
#
# weighting = function(x) weightSMART(x, spec = "ltc")
#
# weighting = weightBin
# weighting = weightTf
# weighting = weightTfIdf # : default
# termFreq selection criteria across obs: tm default: list(global=c(1, Inf))
, bounds = list(global = c(1, Inf))
# wordLengths selection criteria: tm default: c(3, Inf)
, wordLengths = c(1, Inf)
)
glb_txt_cor_var <- glb_rsp_var # : default # or c(<feat>)
# select one from c("union.top.val.cor", "top.cor", "top.val", default: "top.chisq", "sparse")
glbFeatsTextFilter <- "top.chisq"
glbFeatsTextTermsMax <- rep(10, length(glbFeatsText)) # :default
names(glbFeatsTextTermsMax) <- names(glbFeatsText)
# Text Processing Step: extractAssoc
glbFeatsTextAssocCor <- rep(1, length(glbFeatsText)) # :default
names(glbFeatsTextAssocCor) <- names(glbFeatsText)
# Remember to use stemmed terms
glb_important_terms <- list()
# Text Processing Step: extractPatterns (ngrams)
glbFeatsTextPatterns <- list()
#glbFeatsTextPatterns[[<txtFeat>]] <- list()
#glbFeatsTextPatterns[[<txtFeat>]] <- c(metropolitan.diary.colon = "Metropolitan Diary:")
# Have to set it even if it is not used
# Properties:
# numrows(glb_feats_df) << numrows(glbObsFit)
# Select terms that appear in at least 0.2 * O(FP/FN(glbObsOOB)) ???
# numrows(glbObsOOB) = 1.1 * numrows(glbObsNew) ???
glb_sprs_thresholds <- NULL # or c(<txtFeat1> = 0.988, <txtFeat2> = 0.970, <txtFeat3> = 0.970)
# Feature-processing settings: factor cardinality limit, NA imputation,
# clustering and near-zero-variance thresholds.
# NOTE(review): how each setting is consumed is not visible in this section;
# the notes below only restate the values and the alternatives listed here.
glbFctrMaxUniqVals <- 20 # default: 20
glb_impute_na_data <- TRUE # or FALSE
# Seed presumably used for the mice-based imputation -- TODO confirm.
glb_mice_complete.seed <- 144 # or any integer
glb_cluster <- FALSE # : default or TRUE
glb_cluster.seed <- 189 # or any integer
glb_cluster_entropy_var <- NULL # c(glb_rsp_var, as.factor(cut(glb_rsp_var, 3)), default: NULL)
glbFeatsTextClusterVarsExclude <- FALSE # default FALSE
glb_interaction_only_feats <- NULL # : default or c(<parent_feat> = "<child_feat>")
# Near-zero-variance thresholds; both are set to the values noted as caret's
# defaults.
glbFeatsNzvFreqMax <- 19 # 19 : caret default
glbFeatsNzvUniqMin <- 10 # 10 : caret default
glbRFESizes <- list()
#glbRFESizes[["mdlFamily"]] <- c(4, 8, 16, 32, 64, 67, 68, 69) # Accuracy@69/70 = 0.8258
glbObsFitOutliers <- list()
# If outliers.n >= 10; consider concatenation of interaction vars
# glbObsFitOutliers[["<mdlFamily>"]] <- c(NULL
# is.na(.rstudent)
# max(.rstudent)
# is.na(.dffits)
# .hatvalues >= 0.99
# -38,167,642 < minmax(.rstudent) < 49,649,823
# , <comma-separated-<glbFeatsId>>
# )
glbObsTrnOutliers <- list()
glbObsTrnOutliers[["Final"]] <- union(glbObsFitOutliers[["All.X"]],
c(NULL
))
# influence.measures: car::outlier; rstudent; dffits; hatvalues; dfbeta; dfbetas
#mdlId <- "All.X##rcv#glm"; obs_df <- fitobs_df
#mdlId <- "RFE.X.glm"; obs_df <- fitobs_df
#mdlId <- "Final.glm"; obs_df <- trnobs_df
#mdlId <- "CSM2.X.glm"; obs_df <- fitobs_df
#print(outliers <- car::outlierTest(glb_models_lst[[mdlId]]$finalModel))
#mdlIdFamily <- paste0(head(unlist(str_split(mdlId, "\\.")), -1), collapse="."); obs_df <- dplyr::filter_(obs_df, interp(~(!(var %in% glbObsFitOutliers[[mdlIdFamily]])), var = as.name(glbFeatsId))); model_diags_df <- cbind(obs_df, data.frame(.rstudent=stats::rstudent(glb_models_lst[[mdlId]]$finalModel)), data.frame(.dffits=stats::dffits(glb_models_lst[[mdlId]]$finalModel)), data.frame(.hatvalues=stats::hatvalues(glb_models_lst[[mdlId]]$finalModel)));print(summary(model_diags_df[, c(".rstudent",".dffits",".hatvalues")])); table(cut(model_diags_df$.hatvalues, breaks=c(0.00, 0.98, 0.99, 1.00)))
#print(subset(model_diags_df, is.na(.rstudent))[, glbFeatsId])
#print(model_diags_df[which.max(model_diags_df$.rstudent), ])
#print(subset(model_diags_df, is.na(.dffits))[, glbFeatsId])
#print(model_diags_df[which.min(model_diags_df$.dffits), ])
#print(subset(model_diags_df, .hatvalues > 0.99)[, glbFeatsId])
#dffits_df <- merge(dffits_df, outliers_df, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#dffits_df <- merge(dffits_df, glbObsFit, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#subset(dffits_df, !is.na(.Bonf.p))
#mdlId <- "CSM.X.glm"; vars <- myextract_actual_feats(row.names(orderBy(reformulate(c("-", paste0(mdlId, ".imp"))), myget_feats_imp(glb_models_lst[[mdlId]]))));
#model_diags_df <- glb_get_predictions(model_diags_df, mdlId, glb_rsp_var)
#obs_ix <- row.names(model_diags_df) %in% names(outliers$rstudent)[1]
#obs_ix <- which(is.na(model_diags_df$.rstudent))
#obs_ix <- which(is.na(model_diags_df$.dffits))
#myplot_parcoord(obs_df=model_diags_df[, c(glbFeatsId, glbFeatsCategory, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, paste0(glb_rsp_var, mdlId), vars[1:min(20, length(vars))])], obs_ix=obs_ix, id_var=glbFeatsId, category_var=glbFeatsCategory)
#model_diags_df[row.names(model_diags_df) %in% names(outliers$rstudent)[c(1:2)], ]
#ctgry_diags_df <- model_diags_df[model_diags_df[, glbFeatsCategory] %in% c("Unknown#0"), ]
#myplot_parcoord(obs_df=ctgry_diags_df[, c(glbFeatsId, glbFeatsCategory, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indepVar[1:20])], obs_ix=row.names(ctgry_diags_df) %in% names(outliers$rstudent)[1], id_var=glbFeatsId, category_var=glbFeatsCategory)
#table(glbObsFit[model_diags_df[, glbFeatsCategory] %in% c("iPad1#1"), "startprice.log10.cut.fctr"])
#glbObsFit[model_diags_df[, glbFeatsCategory] %in% c("iPad1#1"), c(glbFeatsId, "startprice")]
# No outliers & .dffits == NaN
#myplot_parcoord(obs_df=model_diags_df[, c(glbFeatsId, glbFeatsCategory, glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indepVar[1:10])], obs_ix=seq(1:nrow(model_diags_df))[is.na(model_diags_df$.dffits)], id_var=glbFeatsId, category_var=glbFeatsCategory)
# Modify mdlId to (build & extract) "<FamilyId>#<Fit|Trn>#<caretMethod>#<preProc1.preProc2>#<samplingMethod>"
# Model containers start out empty.
glb_models_lst <- list()
glb_models_df <- data.frame()

# Candidate caret methods, chosen by problem type:
#   regression -> binomial classification -> any other classification.
# The order of the names inside each vector is preserved from the template.
if (glb_is_regression) {
  # Regression
  glbMdlMethods <- c(
    # deterministic
    # "lm", # same as glm
    "glm", "bayesglm", "glmnet",
    "rpart",
    # non-deterministic
    "gbm", "rf",
    # Unknown
    "nnet", "avNNet", # runs 25 models per cv sample for tunelength=5
    "svmLinear", "svmLinear2",
    "svmPoly", # runs 75 models per cv sample for tunelength=5
    "svmRadial",
    "earth",
    "bagEarth" # Takes a long time
  )
} else if (glb_is_binomial) {
  # Classification - Add ada (auto feature selection)
  glbMdlMethods <- c(
    # deterministic
    "bagEarth", # Takes a long time
    "glm", "bayesglm", "glmnet",
    "nnet",
    "rpart",
    # non-deterministic
    "gbm",
    "avNNet", # runs 25 models per cv sample for tunelength=5
    "rf",
    # Unknown
    "lda", "lda2",
    # svm models crash when predict is called -> internal to kernlab it should call predict without .outcome
    "svmLinear", "svmLinear2",
    "svmPoly", # runs 75 models per cv sample for tunelength=5
    "svmRadial",
    "earth"
  )
} else {
  glbMdlMethods <- c(
    # deterministic
    "glmnet",
    # non-deterministic
    "rf",
    # Unknown
    "gbm", "rpart"
  )
}
# Model families to fit, and optional per-family feature lists.
glbMdlFamilies <- list()
glb_mdl_feats_lst <- list()
# family: Choose from c("RFE.X", "CSM.X", "All.X", "Best.Interact")
# methods: Choose from c(NULL, <method>, glbMdlMethods)
#glbMdlFamilies[["RFE.X"]] <- c("glmnet", "glm") # non-NULL vector is mandatory
# non-NULL vector is mandatory
glbMdlFamilies$All.X <- c("glmnet", "glm")
#glbMdlFamilies[["Best.Interact"]] <- "glmnet" # non-NULL vector is mandatory
# Check if interaction features make RFE better
# glbMdlFamilies[["CSM.X"]] <- setdiff(glbMdlMethods, c("lda", "lda2")) # crashing due to category:.clusterid ??? #c("glmnet", "glm") # non-NULL list is mandatory
# glb_mdl_feats_lst[["CSM.X"]] <- c(NULL
# , <comma-separated-features-vector>
# )
# dAFeats.CSM.X %<d-% c(NULL
# # Interaction feats up to varImp(RFE.X.glmnet) >= 50
# , <comma-separated-features-vector>
# , setdiff(myextract_actual_feats(predictors(rfe_fit_results)), c(NULL
# , <comma-separated-features-vector>
# ))
# )
# glb_mdl_feats_lst[["CSM.X"]] <- "%<d-% dAFeats.CSM.X"
# NULL vector acceptable for "Final" (alternative: c("glmnet", "glm")).
# Assigning NULL adds no element, so the list keeps only "All.X".
glbMdlFamilies[["Final"]] <- NULL
# Per-model parallelism switches, keyed by mdlId; every model listed here is
# set to FALSE.
#glbMdlAllowParallel[["<mdlId>"]] <- FALSE
glbMdlAllowParallel <- list(
  "Max.cor.Y##rcv#rpart"            = FALSE,
  "Max.cor.Y.Time.Poly##rcv#glmnet" = FALSE,
  "Max.cor.Y.Time.Lag##rcv#glmnet"  = FALSE,
  "Interact.High.cor.Y##rcv#glmnet" = FALSE,
  "Low.cor.X##rcv#glmnet"           = FALSE,
  "All.X##rcv#glmnet"               = FALSE,
  "All.X##rcv#glm"                  = FALSE
)
# Check if tuning parameters make fit better; make it mdlFamily customizable ?
# Tuning-parameter overrides start out empty.
glbMdlTuneParams <- data.frame()
# When glmnet crashes at model$grid with error: ???
# One row per glmnet tuning parameter; `vals` holds the candidate values as a
# single space-separated string.
glmnetTuneParams <- data.frame(
  parameter = c("alpha", "lambda"),
  vals = c("0.100 0.325 0.550 0.775 1.000", "9.342e-02")
)
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams,
# cbind(data.frame(mdlId = "<mdlId>"),
# glmnetTuneParams))
#avNNet
# size=[1] 3 5 7 9; decay=[0] 1e-04 0.001 0.01 0.1; bag=[FALSE]; RMSE=1.3300906
#bagEarth
# degree=1 [2] 3; nprune=64 128 256 512 [1024]; RMSE=0.6486663 (up)
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "bagEarth", parameter = "nprune", vals = "256")
# ,data.frame(method = "bagEarth", parameter = "degree", vals = "2")
# ))
#earth
# degree=[1]; nprune=2 [9] 17 25 33; RMSE=0.1334478
#gbm
# shrinkage=0.05 [0.10] 0.15 0.20 0.25; n.trees=100 150 200 [250] 300; interaction.depth=[1] 2 3 4 5; n.minobsinnode=[10]; RMSE=0.2008313
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "gbm", parameter = "shrinkage", min = 0.05, max = 0.25, by = 0.05)
# ,data.frame(method = "gbm", parameter = "n.trees", min = 100, max = 300, by = 50)
# ,data.frame(method = "gbm", parameter = "interaction.depth", min = 1, max = 5, by = 1)
# ,data.frame(method = "gbm", parameter = "n.minobsinnode", min = 10, max = 10, by = 10)
# #seq(from=0.05, to=0.25, by=0.05)
# ))
#glmnet
# alpha=0.100 [0.325] 0.550 0.775 1.000; lambda=0.0005232693 0.0024288010 0.0112734954 [0.0523269304] 0.2428800957; RMSE=0.6164891
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "glmnet", parameter = "alpha", vals = "0.550 0.775 0.8875 0.94375 1.000")
# ,data.frame(method = "glmnet", parameter = "lambda", vals = "9.858855e-05 0.0001971771 0.0009152152 0.0042480525 0.0197177130")
# ))
#nnet
# size=3 5 [7] 9 11; decay=0.0001 0.001 0.01 [0.1] 0.2; RMSE=0.9287422
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "nnet", parameter = "size", vals = "3 5 7 9 11")
# ,data.frame(method = "nnet", parameter = "decay", vals = "0.0001 0.0010 0.0100 0.1000 0.2000")
# ))
#rf # Don't bother; results are not deterministic
# mtry=2 35 68 [101] 134; RMSE=0.1339974
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "rf", parameter = "mtry", vals = "2 5 9 13 17")
# ))
#rpart
# cp=0.020 [0.025] 0.030 0.035 0.040; RMSE=0.1770237
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "rpart", parameter = "cp", vals = "0.004347826 0.008695652 0.017391304 0.021739130 0.034782609")
# ))
#svmLinear
# C=0.01 0.05 [0.10] 0.50 1.00 2.00 3.00 4.00; RMSE=0.1271318; 0.1296718
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "svmLinear", parameter = "C", vals = "0.01 0.05 0.1 0.5 1")
# ))
#svmLinear2
# cost=0.0625 0.1250 [0.25] 0.50 1.00; RMSE=0.1276354
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method = "svmLinear2", parameter = "cost", vals = "0.0625 0.125 0.25 0.5 1")
# ))
#svmPoly
# degree=[1] 2 3 4 5; scale=0.01 0.05 [0.1] 0.5 1; C=0.50 1.00 [2.00] 3.00 4.00; RMSE=0.1276130
# glbMdlTuneParams <- myrbind_df(glbMdlTuneParams, rbind(data.frame()
# ,data.frame(method="svmPoly", parameter="degree", min=1, max=5, by=1) #seq(1, 5, 1)
# ,data.frame(method="svmPoly", parameter="scale", vals="0.01, 0.05, 0.1, 0.5, 1")
# ,data.frame(method="svmPoly", parameter="C", vals="0.50, 1.00, 2.00, 3.00, 4.00")
# ))
#svmRadial
# sigma=[0.08674323]; C=0.25 0.50 1.00 [2.00] 4.00; RMSE=0.1614957
#glb2Sav(); all.equal(sav_models_df, glb_models_df)
# ---- Pre-processing ----
# NULL, or choose from:
# c("YeoJohnson", "center.scale", "range", "pca", "ica", "spatialSign")
glb_preproc_methods <- NULL

# ---- Baseline prediction model feature(s) ----
# NULL or c("<feat>")
glb_Baseline_mdl_var <- NULL

# ---- Custom model metric ----
# NULL or matrix(c(
# 0,1,2,3,4,
# 2,0,1,2,3,
# 4,2,0,1,2,
# 6,4,2,0,1,
# 8,6,4,2,0
# ), byrow=TRUE, nrow=5)
glbMdlMetric_terms <- NULL
# NULL or "<metric_name>"
glbMdlMetricSummary <- NULL
# NULL or FALSE (TRUE is not the default for both classification & regression)
glbMdlMetricMaximize <- NULL
# NULL or function(data, lev=NULL, model=NULL) {
# confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
# #print(confusion_mtrx)
# #print(confusion_mtrx * glbMdlMetric_terms)
# metric <- sum(confusion_mtrx * glbMdlMetric_terms) / nrow(data)
# names(metric) <- glbMdlMetricSummary
# return(metric)
# }
glbMdlMetricSummaryFn <- NULL

# ---- Resampling / classification threshold ----
# Turn it on when needed; otherwise takes long time
glbMdlCheckRcv <- FALSE
# 3 or NULL
glb_rcv_n_folds <- 3
# 3 or NULL
glb_rcv_n_repeats <- 3
# NULL or e.g. 0.5
glb_clf_proba_threshold <- NULL
# Model selection criteria: metric names, chosen by problem type.
# The two top-level tests are independent `if`s, exactly as in the template.
if (glb_is_regression) {
  glbMdlMetricsEval <- c("min.RMSE.OOB", "max.R.sq.OOB",
                         "max.Adj.R.sq.fit", "min.RMSE.fit")
}
#glbMdlMetricsEval <- c("min.RMSE.fit", "max.R.sq.fit", "max.Adj.R.sq.fit")
if (glb_is_classification) {
  glbMdlMetricsEval <- if (glb_is_binomial) {
    c("max.Accuracy.OOB", "max.AUCROCR.OOB", "max.AUCpROC.OOB",
      "min.aic.fit", "max.Accuracy.fit")
  } else {
    c("max.Accuracy.OOB", "max.Kappa.OOB")
  }
}
# Ensemble membership; select from:
#   NULL [no ensemble models],
#   "auto" [all models better than MFO or Baseline],
#   c(mdl_ids in glb_models_lst) [Typically top-rated models in auto]
glb_mdl_ensemble <- NULL
# "%<d-% setdiff(mygetEnsembleAutoMdlIds(), 'CSM.X.rf')"
# c(<comma-separated-mdlIds>
# )
# Only for classifications; for regressions remove "(.*)\\.prob" from the regex
# tmp_fitobs_df <- glbObsFit[, grep(paste0("^", gsub(".", "\\.", mygetPredictIds$value, fixed = TRUE), "CSM\\.X\\.(.*)\\.prob"), names(glbObsFit), value = TRUE)]; cor_mtrx <- cor(tmp_fitobs_df); cor_vctr <- sort(cor_mtrx[row.names(orderBy(~-Overall, varImp(glb_models_lst[["Ensemble.repeatedcv.glmnet"]])$imp))[1], ]); summary(cor_vctr); cor_vctr
#ntv.glm <- glm(reformulate(indepVar, glb_rsp_var), family = "binomial", data = glbObsFit)
#step.glm <- step(ntv.glm)

# Selected model id; select from
#   c(NULL, "All.X##rcv#glmnet", "RFE.X##rcv#glmnet", <mdlId>)
glb_sel_mdl_id <- "All.X##rcv#glmnet"
# Final model id; select from c(NULL, glb_sel_mdl_id)
glb_fin_mdl_id <- NULL

# Display columns: position, id, category and response.
glb_dsp_cols <- c(
  ".pos", glbFeatsId, glbFeatsCategory, glb_rsp_var
  # List critical cols excl. above
)
# Output specs
# lclgetfltout_df <- function(obsout_df) {
# require(tidyr)
# obsout_df <- obsout_df %>%
# tidyr::separate("ImageId.x.y", c(".src", ".pos", "x", "y"),
# sep = "#", remove = TRUE, extra = "merge")
# # mnm prefix stands for max_n_mean
# mnmout_df <- obsout_df %>%
# dplyr::group_by(.pos) %>%
# #dplyr::top_n(1, Probability1) %>% # Score = 3.9426
# #dplyr::top_n(2, Probability1) %>% # Score = ???; weighted = 3.94254;
# #dplyr::top_n(3, Probability1) %>% # Score = 3.9418; weighted = 3.94169;
# dplyr::top_n(4, Probability1) %>% # Score = ???; weighted = 3.94149;
# #dplyr::top_n(5, Probability1) %>% # Score = 3.9421; weighted = 3.94178
#
# # dplyr::summarize(xMeanN = mean(as.numeric(x)), yMeanN = mean(as.numeric(y)))
# # dplyr::summarize(xMeanN = weighted.mean(as.numeric(x), Probability1), yMeanN = mean(as.numeric(y)))
# # dplyr::summarize(xMeanN = weighted.mean(as.numeric(x), c(Probability1, 0.2357323, 0.2336925)), yMeanN = mean(as.numeric(y)))
# # dplyr::summarize(xMeanN = weighted.mean(as.numeric(x), c(Probability1)), yMeanN = mean(as.numeric(y)))
# dplyr::summarize(xMeanN = weighted.mean(as.numeric(x), c(Probability1)),
# yMeanN = weighted.mean(as.numeric(y), c(Probability1)))
#
# maxout_df <- obsout_df %>%
# dplyr::group_by(.pos) %>%
# dplyr::summarize(maxProb1 = max(Probability1))
# fltout_df <- merge(maxout_df, obsout_df,
# by.x = c(".pos", "maxProb1"), by.y = c(".pos", "Probability1"),
# all.x = TRUE)
# fmnout_df <- merge(fltout_df, mnmout_df,
# by.x = c(".pos"), by.y = c(".pos"),
# all.x = TRUE)
# return(fmnout_df)
# }
# Output specification for the observations that get written out.
# NOTE: unlike c(NULL, ...), list(NULL, ...) keeps the NULL as an unnamed first
# element, so glbObsOut has length 2 and `vars` is its second element.
glbObsOut <- list(
  NULL,
  # glbFeatsId will be the first output column, by default
  vars = list()
  # ,mapFn = function(obsout_df) {
  # }
)
#obsout_df <- savobsout_df
# glbObsOut$mapFn <- function(obsout_df) {
# txfout_df <- dplyr::select(obsout_df, -.pos.y) %>%
# dplyr::mutate(
# lunch = levels(glbObsTrn[, "lunch" ])[
# round(mean(as.numeric(glbObsTrn[, "lunch" ])), 0)],
# dinner = levels(glbObsTrn[, "dinner" ])[
# round(mean(as.numeric(glbObsTrn[, "dinner" ])), 0)],
# reserve = levels(glbObsTrn[, "reserve" ])[
# round(mean(as.numeric(glbObsTrn[, "reserve" ])), 0)],
# outdoor = levels(glbObsTrn[, "outdoor" ])[
# round(mean(as.numeric(glbObsTrn[, "outdoor" ])), 0)],
# expensive = levels(glbObsTrn[, "expensive"])[
# round(mean(as.numeric(glbObsTrn[, "expensive"])), 0)],
# liquor = levels(glbObsTrn[, "liquor" ])[
# round(mean(as.numeric(glbObsTrn[, "liquor" ])), 0)],
# table = levels(glbObsTrn[, "table" ])[
# round(mean(as.numeric(glbObsTrn[, "table" ])), 0)],
# classy = levels(glbObsTrn[, "classy" ])[
# round(mean(as.numeric(glbObsTrn[, "classy" ])), 0)],
# kids = levels(glbObsTrn[, "kids" ])[
# round(mean(as.numeric(glbObsTrn[, "kids" ])), 0)]
# )
#
# print("ObsNew output class tables:")
# print(sapply(c("lunch","dinner","reserve","outdoor",
# "expensive","liquor","table",
# "classy","kids"),
# function(feat) table(txfout_df[, feat], useNA = "ifany")))
#
# txfout_df <- txfout_df %>%
# dplyr::mutate(labels = "") %>%
# dplyr::mutate(labels =
# ifelse(lunch != "-1", paste(labels, lunch ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(dinner != "-1", paste(labels, dinner ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(reserve != "-1", paste(labels, reserve ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(outdoor != "-1", paste(labels, outdoor ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(expensive != "-1", paste(labels, expensive), labels)) %>%
# dplyr::mutate(labels =
# ifelse(liquor != "-1", paste(labels, liquor ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(table != "-1", paste(labels, table ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(classy != "-1", paste(labels, classy ), labels)) %>%
# dplyr::mutate(labels =
# ifelse(kids != "-1", paste(labels, kids ), labels)) %>%
# dplyr::select(business_id, labels)
# return(txfout_df)
# }
#if (!is.null(glbObsOut$mapFn)) obsout_df <- glbObsOut$mapFn(obsout_df); print(head(obsout_df))
# NULL defaults to "new"; otherwise select from c("all", "new", "trn")
glb_out_obs <- NULL

# Register the output column. The value is stored as a "%<d-% ..." string, not
# evaluated here: binomial classification emits the predicted probability under
# the name "Probability1"; every other case emits the predicted value under the
# response variable's name.
if (glb_is_classification && glb_is_binomial) {
  glbObsOut[["vars"]][["Probability1"]] <-
    "%<d-% glbObsNew[, mygetPredictIds(glb_rsp_var, glb_fin_mdl_id)$prob]"
  # glbObsOut$vars[[glb_rsp_var_raw]] <-
  # "%<d-% glb_map_rsp_var_to_raw(glbObsNew[,
  # mygetPredictIds(glb_rsp_var, glb_fin_mdl_id)$value])"
} else {
  # glbObsOut$vars[[glbFeatsId]] <-
  # "%<d-% as.integer(gsub('Test#', '', glbObsNew[, glbFeatsId]))"
  glbObsOut[["vars"]][[glb_rsp_var]] <-
    "%<d-% glbObsNew[, mygetPredictIds(glb_rsp_var, glb_fin_mdl_id)$value]"
  # for (outVar in setdiff(glbFeatsExcludeLcl, glb_rsp_var_raw))
  # glbObsOut$vars[[outVar]] <-
  # paste0("%<d-% mean(glbObsAll[, \"", outVar, "\"], na.rm = TRUE)")
}
# glbObsOut$vars[[glb_rsp_var_raw]] <- glb_rsp_var_raw
# glbObsOut$vars[[paste0(head(unlist(strsplit(mygetPredictIds$value, "")), -1), collapse = "")]] <-
# Stacked-output file names; NULL is the default
# c("ebayipads_txt_assoc1_out_bid1_stack.csv") # manual stack
# c("ebayipads_finmdl_bid1_out_nnet_1.csv") # universal stack
glbOutStackFnames <- NULL

# Output settings; `pfx` is also the prefix of the .RData file name in the
# commented-out load() call further down.
glbOut <- list(pfx = "MVT_Chicago_2016_")
# lclImageSampleSeed <- 129

# choose from c(NULL, "<projectId>_obsall.csv")
glbOutDataVizFname <- NULL

# Chunk labels of this script, in order.
glbChunks <- list(
  labels = c(
    "set_global_options_wd", "set_global_options",
    "import.data", "inspect.data", "scrub.data", "transform.data",
    "extract.features",
    "extract.features.datetime", "extract.features.image",
    "extract.features.price",
    "extract.features.text", "extract.features.string",
    "extract.features.end",
    "manage.missing.data", "cluster.data", "partition.data.training",
    "select.features",
    "fit.models_0", "fit.models_1", "fit.models_2", "fit.models_3",
    "fit.data.training_0", "fit.data.training_1",
    "predict.data.new",
    "display.session.info"
  )
)
# To ensure that all chunks in this script are in glbChunks
# knitr::all_labels() doesn't work in console runs; the NULL guard skips the
# check in that case.
chkChunksLabels <- knitr::all_labels()
if (!is.null(chkChunksLabels) &&
    !identical(chkChunksLabels, glbChunks$labels)) {
  # Collapse each set difference into one string: sprintf() returns
  # character(0) for a zero-length argument (which would drop the label) and
  # one string per element for a longer one. Two empty diffs mean the same
  # labels appear in a different order or multiplicity.
  print(sprintf("setdiff(chkChunksLabels, glbChunks$labels): %s",
                paste(setdiff(chkChunksLabels, glbChunks$labels),
                      collapse = ", ")))
  print(sprintf("setdiff(glbChunks$labels, chkChunksLabels): %s",
                paste(setdiff(glbChunks$labels, chkChunksLabels),
                      collapse = ", ")))
}
# default (NULL): script will load envir from previous chunk
glbChunks[["first"]] <- NULL
# default (NULL): script will save envir at end of this chunk;
# alternative: "extract.features.end"
glbChunks[["last"]] <- NULL
#mysavChunk(glbOut$pfx, glbChunks[["last"]])
# Inspect max OOB FP
#chkObsOOB <- subset(glbObsOOB, !label.fctr.All.X..rcv.glmnet.is.acc)
#chkObsOOBFP <- subset(chkObsOOB, label.fctr.All.X..rcv.glmnet == "left_eye_center") %>% dplyr::mutate(Probability1 = label.fctr.All.X..rcv.glmnet.prob) %>% select(-.src, -.pos, -x, -y) %>% lclgetfltout_df() %>% mutate(obj.distance = (((as.numeric(x) - left_eye_center_x.int) ^ 2) + ((as.numeric(y) - left_eye_center_y.int) ^ 2)) ^ 0.5) %>% dplyr::top_n(5, obj.distance) %>% dplyr::top_n(5, -patch.cor)
#
#newImgObs <- glbObsNew[(glbObsNew$ImageId == "Test#0001"), ]; print(newImgObs[which.max(newImgObs$label.fctr.Final..rcv.glmnet.prob), ])
#OOBImgObs <- glbObsOOB[(glbObsOOB$ImageId == "Train#0003"), ]; print(OOBImgObs[which.max(OOBImgObs$label.fctr.All.X..rcv.glmnet.prob), ])
#load("MVT_Chicago_2016_extract.features.end.RData", verbose = TRUE)
#mygetImage(which(glbObsAll[, glbFeatsId] == "Train#0003"), names(glbFeatsImage)[1], plot = TRUE, featHighlight = c("left_eye_center_x", "left_eye_center_y"), ovrlHighlight = c(66, 35))
# Depict process: a petri net with six transitions, four places and twelve
# arcs. Arc i runs from begin[i] to end[i]; the "bgn" place carries the
# initial marking M0 = 3.
glb_analytics_pn <- petrinet(
  name = "glb_analytics_pn",
  trans_df = data.frame(
    id = 1:6,
    name = c(
      "data.training.all", "data.new",
      "model.selected", "model.final",
      "data.training.all.prediction", "data.new.prediction"
    ),
    x = c(-5, -5, -15, -25, -25, -35),
    y = c(-5, 5, 0, 0, -5, 5)
  ),
  places_df = data.frame(
    id = 1:4,
    name = c("bgn", "fit.data.training.all", "predict.data.new", "end"),
    x = c(-0, -20, -30, -40),
    y = c(0, 0, 0, 0),
    M0 = c(3, 0, 0, 0)
  ),
  arcs_df = data.frame(
    begin = c(
      "bgn", "bgn", "bgn",
      "data.training.all", "model.selected", "fit.data.training.all",
      "fit.data.training.all", "model.final",
      "data.new", "predict.data.new",
      "data.training.all.prediction", "data.new.prediction"
    ),
    end = c(
      "data.training.all", "data.new", "model.selected",
      "fit.data.training.all", "fit.data.training.all", "model.final",
      "data.training.all.prediction", "predict.data.new",
      "predict.data.new", "data.new.prediction",
      "end", "end"
    )
  )
)
#print(ggplot.petrinet(glb_analytics_pn))
# Plot with the axes flipped.
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
glb_analytics_avl_objs <- NULL
# Start the chunk tracker with the "import.data" chunk.
glb_chunks_df <- myadd_chunk(NULL, "import.data")
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 11.867 NA NA
1.0: import data
## [1] "Reading file ./data/mvtWeek1.csv..."
## [1] "dimensions of data in ./data/mvtWeek1.csv: 191,641 rows x 11 cols"
## ID Date LocationDescription Arrest Domestic
## 1 8951354 12/31/12 23:15 STREET FALSE FALSE
## 2 8951141 12/31/12 22:00 STREET FALSE FALSE
## 3 8952745 12/31/12 22:00 RESIDENTIAL YARD (FRONT/BACK) FALSE FALSE
## 4 8952223 12/31/12 22:00 STREET FALSE FALSE
## 5 8951608 12/31/12 21:30 STREET FALSE FALSE
## 6 8950793 12/31/12 20:30 STREET TRUE FALSE
## Beat District CommunityArea Year Latitude Longitude
## 1 623 6 69 2012 41.75628 -87.62164
## 2 1213 12 24 2012 41.89879 -87.66130
## 3 1622 16 11 2012 41.96919 -87.76767
## 4 724 7 67 2012 41.76933 -87.65773
## 5 211 2 35 2012 41.83757 -87.62176
## 6 2521 25 19 2012 41.92856 -87.75400
## ID Date LocationDescription Arrest Domestic Beat
## 6619 8702591 7/12/12 0:00 STREET FALSE FALSE 1924
## 29201 7887067 1/11/11 18:00 STREET FALSE FALSE 2223
## 97589 4689109 4/11/06 11:00 STREET FALSE FALSE 2514
## 139458 2984390 10/9/03 21:00 STREET FALSE FALSE 1724
## 140984 2937323 9/7/03 20:30 SCHOOL, PUBLIC, GROUNDS FALSE FALSE 1134
## 189673 1375824 2/10/01 23:55 STREET FALSE FALSE 331
## District CommunityArea Year Latitude Longitude
## 6619 19 6 2012 41.94467 -87.64956
## 29201 22 73 2011 41.71918 -87.64330
## 97589 25 19 2006 41.93117 -87.76612
## 139458 17 16 2003 41.95781 -87.70707
## 140984 11 27 2003 41.87330 -87.70605
## 189673 NA NA 2001 41.76816 -87.56890
## ID Date LocationDescription Arrest Domestic Beat
## 191636 1310755 1/1/01 0:05 RESIDENCE-GARAGE FALSE FALSE 2332
## 191637 1310068 1/1/01 0:05 STREET FALSE FALSE 1123
## 191638 1313404 1/1/01 0:01 STREET FALSE FALSE 2023
## 191639 1313442 1/1/01 0:01 STREET FALSE FALSE 911
## 191640 1563324 1/1/01 0:01 STREET FALSE FALSE 1712
## 191641 1310463 1/1/01 0:01 STREET FALSE FALSE 1911
## District CommunityArea Year Latitude Longitude
## 191636 NA NA 2001 41.93978 -87.63969
## 191637 NA NA 2001 41.88757 -87.71132
## 191638 NA NA 2001 41.98192 -87.65888
## 191639 NA NA 2001 41.79902 -87.69654
## 191640 NA NA 2001 41.97144 -87.72706
## 191641 NA NA 2001 41.96833 -87.69639
## 'data.frame': 191641 obs. of 11 variables:
## $ ID : int 8951354 8951141 8952745 8952223 8951608 8950793 8950760 8951611 8951802 8950706 ...
## $ Date : chr "12/31/12 23:15" "12/31/12 22:00" "12/31/12 22:00" "12/31/12 22:00" ...
## $ LocationDescription: chr "STREET" "STREET" "RESIDENTIAL YARD (FRONT/BACK)" "STREET" ...
## $ Arrest : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ Domestic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Beat : int 623 1213 1622 724 211 2521 423 231 1021 1215 ...
## $ District : int 6 12 16 7 2 25 4 2 10 12 ...
## $ CommunityArea : int 69 24 11 67 35 19 48 40 29 24 ...
## $ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ Latitude : num 41.8 41.9 42 41.8 41.8 ...
## $ Longitude : num -87.6 -87.7 -87.8 -87.7 -87.6 ...
## - attr(*, "comment")= chr "glbObsTrn"
## NULL
## Warning: No file specified for glbObsNew & splitSpecs$method not specified.
## Defaulting to copy.
## ID Date LocationDescription Arrest Domestic
## 1 8951354 12/31/12 23:15 STREET FALSE FALSE
## 2 8951141 12/31/12 22:00 STREET FALSE FALSE
## 3 8952745 12/31/12 22:00 RESIDENTIAL YARD (FRONT/BACK) FALSE FALSE
## 4 8952223 12/31/12 22:00 STREET FALSE FALSE
## 5 8951608 12/31/12 21:30 STREET FALSE FALSE
## 6 8950793 12/31/12 20:30 STREET TRUE FALSE
## Beat District CommunityArea Year Latitude Longitude
## 1 623 6 69 2012 41.75628 -87.62164
## 2 1213 12 24 2012 41.89879 -87.66130
## 3 1622 16 11 2012 41.96919 -87.76767
## 4 724 7 67 2012 41.76933 -87.65773
## 5 211 2 35 2012 41.83757 -87.62176
## 6 2521 25 19 2012 41.92856 -87.75400
## ID Date LocationDescription Arrest Domestic Beat
## 218 8946210 12/27/12 7:00 STREET FALSE FALSE 2525
## 34296 7736201 9/29/10 16:00 STREET FALSE FALSE 1922
## 74384 5866687 10/27/07 15:00 STREET FALSE FALSE 822
## 74971 5837175 10/14/07 2:00 STREET FALSE FALSE 1111
## 77132 5730266 8/19/07 19:00 STREET FALSE FALSE 1222
## 88633 5086302 10/30/06 1:00 STREET FALSE FALSE 825
## District CommunityArea Year Latitude Longitude
## 218 25 22 2012 41.92095 -87.72438
## 34296 19 6 2010 41.96072 -87.67225
## 74384 8 63 2007 41.79707 -87.70380
## 74971 11 23 2007 41.89786 -87.73393
## 77132 12 31 2007 41.85477 -87.66362
## 88633 8 66 2006 41.78253 -87.68969
## ID Date LocationDescription Arrest Domestic Beat
## 191636 1310755 1/1/01 0:05 RESIDENCE-GARAGE FALSE FALSE 2332
## 191637 1310068 1/1/01 0:05 STREET FALSE FALSE 1123
## 191638 1313404 1/1/01 0:01 STREET FALSE FALSE 2023
## 191639 1313442 1/1/01 0:01 STREET FALSE FALSE 911
## 191640 1563324 1/1/01 0:01 STREET FALSE FALSE 1712
## 191641 1310463 1/1/01 0:01 STREET FALSE FALSE 1911
## District CommunityArea Year Latitude Longitude
## 191636 NA NA 2001 41.93978 -87.63969
## 191637 NA NA 2001 41.88757 -87.71132
## 191638 NA NA 2001 41.98192 -87.65888
## 191639 NA NA 2001 41.79902 -87.69654
## 191640 NA NA 2001 41.97144 -87.72706
## 191641 NA NA 2001 41.96833 -87.69639
## 'data.frame': 191641 obs. of 11 variables:
## $ ID : int 8951354 8951141 8952745 8952223 8951608 8950793 8950760 8951611 8951802 8950706 ...
## $ Date : chr "12/31/12 23:15" "12/31/12 22:00" "12/31/12 22:00" "12/31/12 22:00" ...
## $ LocationDescription: chr "STREET" "STREET" "RESIDENTIAL YARD (FRONT/BACK)" "STREET" ...
## $ Arrest : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ Domestic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Beat : int 623 1213 1622 724 211 2521 423 231 1021 1215 ...
## $ District : int 6 12 16 7 2 25 4 2 10 12 ...
## $ CommunityArea : int 69 24 11 67 35 19 48 40 29 24 ...
## $ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ Latitude : num 41.8 41.9 42 41.8 41.8 ...
## $ Longitude : num -87.6 -87.7 -87.8 -87.7 -87.6 ...
## - attr(*, "comment")= chr "glbObsNew"
## ID Date LocationDescription Arrest Domestic
## 1 8951354 12/31/12 23:15 STREET FALSE FALSE
## 2 8951141 12/31/12 22:00 STREET FALSE FALSE
## 3 8952745 12/31/12 22:00 RESIDENTIAL YARD (FRONT/BACK) FALSE FALSE
## 4 8952223 12/31/12 22:00 STREET FALSE FALSE
## 5 8951608 12/31/12 21:30 STREET FALSE FALSE
## 6 8950793 12/31/12 20:30 STREET TRUE FALSE
## Beat District CommunityArea Year Latitude Longitude
## 1 623 6 69 2012 41.75628 -87.62164
## 2 1213 12 24 2012 41.89879 -87.66130
## 3 1622 16 11 2012 41.96919 -87.76767
## 4 724 7 67 2012 41.76933 -87.65773
## 5 211 2 35 2012 41.83757 -87.62176
## 6 2521 25 19 2012 41.92856 -87.75400
## ID Date LocationDescription Arrest Domestic Beat
## 62619 6441079 8/17/08 1:20 STREET FALSE FALSE 1114
## 86953 5177562 12/11/06 2:00 SIDEWALK FALSE FALSE 1333
## 123518 3694168 9/17/04 12:00 STREET FALSE FALSE 823
## 135580 3122805 12/31/03 14:00 DRIVEWAY - RESIDENTIAL FALSE FALSE 1114
## 182377 1616302 7/5/01 17:00 STREET FALSE FALSE 725
## 185011 1536051 5/18/01 0:01 STREET FALSE FALSE 831
## District CommunityArea Year Latitude Longitude
## 62619 11 26 2008 41.88209 -87.73107
## 86953 13 28 2006 41.88541 -87.66713
## 123518 8 66 2004 41.77787 -87.70729
## 135580 11 26 2003 41.88454 -87.72822
## 182377 NA NA 2001 41.77576 -87.66153
## 185011 NA NA 2001 41.77242 -87.70795
## ID Date LocationDescription Arrest Domestic Beat
## 191636 1310755 1/1/01 0:05 RESIDENCE-GARAGE FALSE FALSE 2332
## 191637 1310068 1/1/01 0:05 STREET FALSE FALSE 1123
## 191638 1313404 1/1/01 0:01 STREET FALSE FALSE 2023
## 191639 1313442 1/1/01 0:01 STREET FALSE FALSE 911
## 191640 1563324 1/1/01 0:01 STREET FALSE FALSE 1712
## 191641 1310463 1/1/01 0:01 STREET FALSE FALSE 1911
## District CommunityArea Year Latitude Longitude
## 191636 NA NA 2001 41.93978 -87.63969
## 191637 NA NA 2001 41.88757 -87.71132
## 191638 NA NA 2001 41.98192 -87.65888
## 191639 NA NA 2001 41.79902 -87.69654
## 191640 NA NA 2001 41.97144 -87.72706
## 191641 NA NA 2001 41.96833 -87.69639
## 'data.frame': 191641 obs. of 11 variables:
## $ ID : int 8951354 8951141 8952745 8952223 8951608 8950793 8950760 8951611 8951802 8950706 ...
## $ Date : chr "12/31/12 23:15" "12/31/12 22:00" "12/31/12 22:00" "12/31/12 22:00" ...
## $ LocationDescription: chr "STREET" "STREET" "RESIDENTIAL YARD (FRONT/BACK)" "STREET" ...
## $ Arrest : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
## $ Domestic : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Beat : int 623 1213 1622 724 211 2521 423 231 1021 1215 ...
## $ District : int 6 12 16 7 2 25 4 2 10 12 ...
## $ CommunityArea : int 69 24 11 67 35 19 48 40 29 24 ...
## $ Year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ Latitude : num 41.8 41.9 42 41.8 41.8 ...
## $ Longitude : num -87.6 -87.7 -87.8 -87.7 -87.6 ...
## - attr(*, "comment")= chr "glbObsTrn"
## Warning: glbObsTrn same as glbObsAll
## Warning: glbObsNew same as glbObsAll
## [1] "Creating new feature: LocationDescription.my..."
## The following `from` values were not present in `x`: **
## [1] "Creating new feature: .pos..."
## [1] "Creating new feature: .pos.y..."
## [1] "Creating new feature: District.fctr..."
## [1] "Partition stats:"
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
## Arrest .src .n
## 1 FALSE Test 176105
## 2 FALSE Train 176105
## 3 TRUE Test 15536
## 4 TRUE Train 15536
## Arrest .src .n
## 1 FALSE Test 176105
## 2 FALSE Train 176105
## 3 TRUE Test 15536
## 4 TRUE Train 15536
## Loading required package: RColorBrewer
## .src .n
## 1 Test 191641
## 2 Train 191641
## [1] "Running glbObsDropCondition filter: is.na(strptime(glbObsAll[, \"Date\"], glbFeatsDateTime[[\"Date\"]][\"format\"], tz = glbFeatsDateTime[[\"Date\"]][\"timezone\"]))"
## [1] "Partition stats:"
## Arrest .src .n
## 1 FALSE Test 176093
## 2 FALSE Train 176093
## 3 TRUE Test 15536
## 4 TRUE Train 15536
## Arrest .src .n
## 1 FALSE Test 176093
## 2 FALSE Train 176093
## 3 TRUE Test 15536
## 4 TRUE Train 15536
## .src .n
## 1 Test 191629
## 2 Train 191629
## Loading required package: lazyeval
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## [1] "Found 0 duplicates by all features:"
## NULL
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 11.867 85.116 73.249
## 2 inspect.data 2 0 0 85.117 NA NA
2.0: inspect data
## Loading required package: reshape2
## Arrest.FALSE Arrest.TRUE
## Test 176093 15536
## Train 176093 15536
## Arrest.FALSE Arrest.TRUE
## Test 0.9189267 0.08107332
## Train 0.9189267 0.08107332
## [1] "numeric data missing in : "
## District CommunityArea Latitude Longitude
## 86110 49232 4552 4552
## [1] "numeric data w/ 0s in : "
## Arrest Domestic CommunityArea
## 352186 382428 4
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## Date LocationDescription LocationDescription.my
## 0 0 0
## Arrest Arrest .n
## 1 F F 352186
## 2 T T 31072
## Arrest.F Arrest.T
## Test 176093 15536
## Train 176093 15536
## Arrest.F Arrest.T
## Test 0.9189267 0.08107332
## Train 0.9189267 0.08107332
## Loading required package: caTools
## Warning: position_stack requires non-overlapping x intervals
## [1] "elapsed Time (secs): 21.183000"
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## Scale for 'y' is already present. Adding another scale for 'y', which
## will replace the existing scale.
## [1] "elapsed Time (secs): 7.187000"
## [1] "elapsed Time (secs): 7.187000"
## label step_major step_minor label_minor bgn end elapsed
## 2 inspect.data 2 0 0 85.117 122.158 37.042
## 3 scrub.data 2 1 1 122.159 NA NA
2.1: scrub data
## [1] "numeric data missing in : "
## District CommunityArea Latitude Longitude
## 86110 49232 4552 4552
## [1] "numeric data w/ 0s in : "
## Domestic CommunityArea
## 382428 4
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## Date LocationDescription LocationDescription.my
## 0 0 0
## label step_major step_minor label_minor bgn end elapsed
## 3 scrub.data 2 1 1 122.159 128.129 5.97
## 4 transform.data 2 2 2 128.129 NA NA
2.2: transform data
## label step_major step_minor label_minor bgn end
## 4 transform.data 2 2 2 128.129 128.172
## 5 extract.features 3 0 0 128.173 NA
## elapsed
## 4 0.044
## 5 NA
3.0: extract features
## label step_major step_minor label_minor bgn
## 5 extract.features 3 0 0 128.173
## 6 extract.features.datetime 3 1 1 128.198
## end elapsed
## 5 128.197 0.024
## 6 NA NA
3.1: extract features datetime
## label step_major step_minor label_minor bgn
## 1 extract.features.datetime.bgn 1 0 0 128.245
## end elapsed
## 1 NA NA
## label step_major step_minor label_minor
## 1 extract.features.datetime.bgn 1 0 0
## 2 extract.features_xtract.DateTime.vars 2 0 0
## bgn end elapsed
## 1 128.245 128.253 0.008
## 2 128.254 NA NA
## [1] "Extracting features from DateTime(s): Date"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Loading required package: XML
## [1] " accessing url: http://about.usps.com/news/events-calendar/2001-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2001-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2002-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2002-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2003-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2003-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2004-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2004-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2005-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2005-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2006-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2006-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2007-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2007-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2008-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2008-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2009-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2009-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2010-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : unable to access url:http://about.usps.com/news/
## events-calendar/2010-federal-holidays.htm; skipping ...
## [1] " accessing url: http://about.usps.com/news/events-calendar/2011-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : US Federal Holidays not found for year: 2011
## [1] " accessing url: http://about.usps.com/news/events-calendar/2012-federal-holidays.htm"
## Warning in myextract_dates_df(df = glbObsAll, vars =
## names(glbFeatsDateTime), : xpathSApply did not work for url:http://
## about.usps.com/news/events-calendar/2012-federal-holidays.htm; skipping ...
## [1] "**********"
## [1] "Consider adding state & city holidays for glbFeatsDateTime: Date"
## [1] "**********"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## label step_major step_minor label_minor bgn
## 6 extract.features.datetime 3 1 1 128.198
## 7 extract.features.image 3 2 2 416.398
## end elapsed
## 6 416.397 288.2
## 7 NA NA
3.2: extract features image
## label step_major step_minor label_minor bgn end
## 1 extract.features.image.bgn 1 0 0 490.244 NA
## elapsed
## 1 NA
## label step_major step_minor label_minor bgn
## 1 extract.features.image.bgn 1 0 0 490.244
## 2 extract.features.image.end 2 0 0 490.254
## end elapsed
## 1 490.253 0.01
## 2 NA NA
## label step_major step_minor label_minor bgn
## 1 extract.features.image.bgn 1 0 0 490.244
## 2 extract.features.image.end 2 0 0 490.254
## end elapsed
## 1 490.253 0.01
## 2 NA NA
## label step_major step_minor label_minor bgn end
## 7 extract.features.image 3 2 2 416.398 490.264
## 8 extract.features.price 3 3 3 490.265 NA
## elapsed
## 7 73.866
## 8 NA
3.3: extract features price
## label step_major step_minor label_minor bgn end
## 1 extract.features.price.bgn 1 0 0 490.291 NA
## elapsed
## 1 NA
## label step_major step_minor label_minor bgn end
## 8 extract.features.price 3 3 3 490.265 490.3
## 9 extract.features.text 3 4 4 490.301 NA
## elapsed
## 8 0.035
## 9 NA
3.4: extract features text
## label step_major step_minor label_minor bgn end
## 1 extract.features.text.bgn 1 0 0 490.345 NA
## elapsed
## 1 NA
## label step_major step_minor label_minor bgn
## 9 extract.features.text 3 4 4 490.301
## 10 extract.features.string 3 5 5 490.355
## end elapsed
## 9 490.354 0.053
## 10 NA NA
3.5: extract features string
## label step_major step_minor label_minor bgn
## 1 extract.features.string.bgn 1 0 0 490.425
## end elapsed
## 1 NA NA
## label step_major step_minor
## 1 extract.features.string.bgn 1 0
## 2 extract.features.stringfactorize.str.vars 2 0
## label_minor bgn end elapsed
## 1 0 490.425 490.435 0.01
## 2 0 490.436 NA NA
## Date LocationDescription .src
## "Date" "LocationDescription" ".src"
## LocationDescription.my
## "LocationDescription.my"
## Warning: Creating factors of string variable: LocationDescription.my: # of
## unique values: 14
## label step_major step_minor label_minor bgn
## 10 extract.features.string 3 5 5 490.355
## 11 extract.features.end 3 6 6 490.555
## end elapsed
## 10 490.554 0.199
## 11 NA NA
3.6: extract features end
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## label step_major step_minor label_minor bgn end
## 11 extract.features.end 3 6 6 490.555 491.507
## 12 manage.missing.data 4 0 0 491.507 NA
## elapsed
## 11 0.952
## 12 NA
4.0: manage missing data
## [1] "numeric data missing in : "
## District CommunityArea Latitude Longitude
## 86110 49232 4552 4552
## [1] "numeric data w/ 0s in : "
## Domestic CommunityArea Date.wkday.fctr Date.wkend
## 382428 4 52608 276414
## Date.hlday Date.second.fctr Date.day.minutes Date.last2.log1p
## 382526 383258 11986 119920
## Date.last4.log1p Date.last8.log1p Date.last16.log1p Date.last32.log1p
## 41746 4572 66 32
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## Date LocationDescription LocationDescription.my
## 0 0 0
## [1] "numeric data missing in : "
## District CommunityArea Latitude Longitude
## 86110 49232 4552 4552
## [1] "numeric data w/ 0s in : "
## Domestic CommunityArea Date.wkday.fctr Date.wkend
## 382428 4 52608 276414
## Date.hlday Date.second.fctr Date.day.minutes Date.last2.log1p
## 382526 383258 11986 119920
## Date.last4.log1p Date.last8.log1p Date.last16.log1p Date.last32.log1p
## 41746 4572 66 32
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## Date LocationDescription LocationDescription.my
## 0 0 0
## label step_major step_minor label_minor bgn end
## 12 manage.missing.data 4 0 0 491.507 494.682
## 13 cluster.data 5 0 0 494.682 NA
## elapsed
## 12 3.175
## 13 NA
5.0: cluster data
## label step_major step_minor label_minor bgn
## 13 cluster.data 5 0 0 494.682
## 14 partition.data.training 6 0 0 495.580
## end elapsed
## 13 495.58 0.898
## 14 NA NA
6.0: partition data training
## [1] "partition.data.training chunk: setup: elapsed: 0.00 secs"
## [1] "Newdata contains non-NA data for Arrest; setting OOB to Newdata"
## [1] "partition.data.training chunk: Fit/OOB partition complete: elapsed: 0.01 secs"
## Arrest.F Arrest.T
## OOB 352186 31072
## Arrest.F Arrest.T
## OOB 0.9189267 0.08107332
## LocationDescription.my .n.Fit .n.OOB .n.Tst .freqRatio.Fit
## 9 STREET 156553 156553 156553 0.816958811
## 7 PARKING LOT/GARAGE(NON.RESID.) 14852 14852 14852 0.077503927
## 8 Residence 5891 5891 5891 0.030741694
## 6 Other 5206 5206 5206 0.027167078
## 1 ALLEY 2307 2307 2307 0.012038888
## 4 GAS STATION 2111 2111 2111 0.011016078
## 12 VACANT LOT/LAND 985 985 985 0.005140141
## 13 VEHICLE NON-COMMERCIAL 817 817 817 0.004263447
## 3 Entertainment 651 651 651 0.003397189
## 2 CommercialVehicle 648 648 648 0.003381534
## 11 Sidewalk 463 463 463 0.002416127
## 10 School 415 415 415 0.002165643
## 14 cha 410 410 410 0.002139551
## 5 Government 320 320 320 0.001669893
## .freqRatio.OOB .freqRatio.Tst
## 9 0.816958811 0.816958811
## 7 0.077503927 0.077503927
## 8 0.030741694 0.030741694
## 6 0.027167078 0.027167078
## 1 0.012038888 0.012038888
## 4 0.011016078 0.011016078
## 12 0.005140141 0.005140141
## 13 0.004263447 0.004263447
## 3 0.003397189 0.003397189
## 2 0.003381534 0.003381534
## 11 0.002416127 0.002416127
## 10 0.002165643 0.002165643
## 14 0.002139551 0.002139551
## 5 0.001669893 0.001669893
## [1] "glbObsAll: "
## [1] 383258 42
## [1] "glbObsTrn: "
## [1] 191629 42
## [1] "glbObsFit: "
## [1] 191629 41
## [1] "glbObsOOB: "
## [1] 191629 41
## [1] "glbObsNew: "
## [1] 191629 41
## [1] "partition.data.training chunk: teardown: elapsed: 8.49 secs"
## label step_major step_minor label_minor bgn
## 14 partition.data.training 6 0 0 495.580
## 15 select.features 7 0 0 504.131
## end elapsed
## 14 504.13 8.55
## 15 NA NA
7.0: select features
## Warning in cor(data.matrix(entity_df[, sel_feats]), y =
## as.numeric(entity_df[, : the standard deviation is zero
## [1] "cor(.pos, .pos.y)=1.0000"
## [1] "cor(Arrest, .pos)=0.0895"
## [1] "cor(Arrest, .pos.y)=0.0895"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified .pos.y as highly correlated with .pos
## [1] "cor(Date.year.fctr, Year)=1.0000"
## [1] "cor(Arrest, Date.year.fctr)=-0.0903"
## [1] "cor(Arrest, Year)=-0.0903"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Year as highly correlated with Date.year.fctr
## [1] "cor(.pos, ID)=-0.9972"
## [1] "cor(Arrest, .pos)=0.0895"
## [1] "cor(Arrest, ID)=-0.0909"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified .pos as highly correlated with ID
## [1] "cor(Date.juliandate, Date.month.fctr)=0.9965"
## [1] "cor(Arrest, Date.juliandate)=-0.0080"
## [1] "cor(Arrest, Date.month.fctr)=-0.0081"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Date.juliandate as highly correlated with
## Date.month.fctr
## [1] "cor(Date.year.fctr, ID)=0.9956"
## [1] "cor(Arrest, Date.year.fctr)=-0.0903"
## [1] "cor(Arrest, ID)=-0.0909"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glbObsTrn, : Identified Date.year.fctr as highly correlated with ID
## cor.y exclude.as.feat cor.y.abs
## .pos 0.089474127 0 0.089474127
## .pos.y 0.089474127 0 0.089474127
## Date.zoo 0.088569997 1 0.088569997
## LocationDescription.my.fctr 0.040563975 0 0.040563975
## Date.minute.fctr 0.033498618 0 0.033498618
## Domestic 0.012895481 0 0.012895481
## Date.day.minutes.poly.5 0.011356413 0 0.011356413
## Date.wkend 0.010825793 0 0.010825793
## Date.last2.log1p 0.009267662 0 0.009267662
## Date.day.minutes.poly.1 0.006248809 0 0.006248809
## Date.day.minutes 0.006248809 1 0.006248809
## Date.hour.fctr 0.004798350 0 0.004798350
## Date.day.minutes.poly.4 0.003308926 0 0.003308926
## Latitude 0.002546354 1 0.002546354
## Date.last16.log1p 0.002532701 0 0.002532701
## Date.last32.log1p 0.001996833 0 0.001996833
## Longitude 0.001868830 1 0.001868830
## Date.date.fctr 0.000479833 0 0.000479833
## Beat -0.001380185 0 0.001380185
## Date.last8.log1p -0.002512872 0 0.002512872
## CommunityArea -0.002873257 1 0.002873257
## Date.last4.log1p -0.003703223 0 0.003703223
## Date.wkday.fctr -0.003882228 0 0.003882228
## District -0.004096505 1 0.004096505
## .rnorm -0.005426180 0 0.005426180
## Date.day.minutes.poly.3 -0.007752122 0 0.007752122
## Date.juliandate -0.008033777 0 0.008033777
## Date.month.fctr -0.008129434 0 0.008129434
## Date.hlday -0.008176640 0 0.008176640
## Date.day.minutes.poly.2 -0.027015801 0 0.027015801
## District.fctr -0.038853991 0 0.038853991
## Year -0.090272965 0 0.090272965
## Date.year.fctr -0.090272965 0 0.090272965
## Date.POSIX -0.090696722 1 0.090696722
## ID -0.090859450 0 0.090859450
## Date.second.fctr NA 0 NA
## cor.high.X freqRatio percentUnique
## .pos ID 1.000000 1.000000e+02
## .pos.y .pos 1.000000 1.000000e+02
## Date.zoo <NA> 1.000000 3.361600e+01
## LocationDescription.my.fctr <NA> 10.540870 7.305784e-03
## Date.minute.fctr <NA> 2.841683 2.087367e-03
## Domestic <NA> 460.756627 1.043683e-03
## Date.day.minutes.poly.5 <NA> 1.214868 7.498865e-01
## Date.wkend <NA> 2.587080 1.043683e-03
## Date.last2.log1p <NA> 1.544017 1.774262e-01
## Date.day.minutes.poly.1 <NA> 1.214868 7.498865e-01
## Date.day.minutes <NA> 1.214868 7.498865e-01
## Date.hour.fctr <NA> 1.573103 1.565525e-03
## Date.day.minutes.poly.4 <NA> 1.214868 7.498865e-01
## Latitude <NA> 1.038462 1.728548e+01
## Date.last16.log1p <NA> 1.157359 4.268665e-01
## Date.last32.log1p <NA> 1.039599 6.027271e-01
## Longitude <NA> 1.775194 1.303717e+01
## Date.date.fctr <NA> 1.133862 2.609208e-03
## Beat <NA> 1.001856 1.565525e-01
## Date.last8.log1p <NA> 1.002906 3.073648e-01
## CommunityArea <NA> 1.516955 4.070365e-02
## Date.last4.log1p <NA> 1.385158 2.316977e-01
## Date.wkday.fctr <NA> 1.068135 3.652892e-03
## District <NA> 1.018406 1.356788e-02
## .rnorm <NA> 1.000000 9.937014e+01
## Date.day.minutes.poly.3 <NA> 1.214868 7.498865e-01
## Date.juliandate Date.month.fctr 1.001639 1.909941e-01
## Date.month.fctr <NA> 1.016963 6.262100e-03
## Date.hlday <NA> 522.576503 1.043683e-03
## Date.day.minutes.poly.2 <NA> 1.214868 7.498865e-01
## District.fctr <NA> 1.238749 2.087367e-03
## Year Date.year.fctr 1.102229 6.262100e-03
## Date.year.fctr ID 1.102229 6.262100e-03
## Date.POSIX <NA> 1.100000 6.871089e+01
## ID <NA> 1.000000 1.000000e+02
## Date.second.fctr <NA> 0.000000 5.218417e-04
## zeroVar nzv is.cor.y.abs.low
## .pos FALSE FALSE FALSE
## .pos.y FALSE FALSE FALSE
## Date.zoo FALSE FALSE FALSE
## LocationDescription.my.fctr FALSE FALSE FALSE
## Date.minute.fctr FALSE FALSE FALSE
## Domestic FALSE TRUE FALSE
## Date.day.minutes.poly.5 FALSE FALSE FALSE
## Date.wkend FALSE FALSE FALSE
## Date.last2.log1p FALSE FALSE FALSE
## Date.day.minutes.poly.1 FALSE FALSE FALSE
## Date.day.minutes FALSE FALSE FALSE
## Date.hour.fctr FALSE FALSE TRUE
## Date.day.minutes.poly.4 FALSE FALSE TRUE
## Latitude FALSE FALSE TRUE
## Date.last16.log1p FALSE FALSE TRUE
## Date.last32.log1p FALSE FALSE TRUE
## Longitude FALSE FALSE TRUE
## Date.date.fctr FALSE FALSE TRUE
## Beat FALSE FALSE TRUE
## Date.last8.log1p FALSE FALSE TRUE
## CommunityArea FALSE FALSE TRUE
## Date.last4.log1p FALSE FALSE TRUE
## Date.wkday.fctr FALSE FALSE TRUE
## District FALSE FALSE TRUE
## .rnorm FALSE FALSE FALSE
## Date.day.minutes.poly.3 FALSE FALSE FALSE
## Date.juliandate FALSE FALSE FALSE
## Date.month.fctr FALSE FALSE FALSE
## Date.hlday FALSE TRUE FALSE
## Date.day.minutes.poly.2 FALSE FALSE FALSE
## District.fctr FALSE FALSE FALSE
## Year FALSE FALSE FALSE
## Date.year.fctr FALSE FALSE FALSE
## Date.POSIX FALSE FALSE FALSE
## ID FALSE FALSE FALSE
## Date.second.fctr TRUE TRUE NA
## Warning in myplot_scatter(plt_feats_df, "percentUnique", "freqRatio",
## colorcol_name = "nzv", : converting nzv to class:factor
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## cor.y exclude.as.feat cor.y.abs cor.high.X
## Domestic 0.01289548 0 0.01289548 <NA>
## Date.hlday -0.00817664 0 0.00817664 <NA>
## Date.second.fctr NA 0 NA <NA>
## freqRatio percentUnique zeroVar nzv is.cor.y.abs.low
## Domestic 460.7566 0.0010436834 FALSE TRUE FALSE
## Date.hlday 522.5765 0.0010436834 FALSE TRUE FALSE
## Date.second.fctr 0.0000 0.0005218417 TRUE TRUE NA
## [1] "numeric data missing in : "
## District CommunityArea Latitude Longitude
## 86110 49232 4552 4552
## [1] "numeric data w/ 0s in : "
## Domestic CommunityArea Date.wkday.fctr Date.wkend
## 382428 4 52608 276414
## Date.hlday Date.second.fctr Date.day.minutes Date.last2.log1p
## 382526 383258 11986 119920
## Date.last4.log1p Date.last8.log1p Date.last16.log1p Date.last32.log1p
## 41746 4572 66 32
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## Date LocationDescription LocationDescription.my
## 0 0 0
## .lcn
## 0
## [1] "glb_feats_df:"
## [1] 36 12
## id exclude.as.feat rsp_var
## Arrest Arrest TRUE TRUE
## id cor.y exclude.as.feat cor.y.abs cor.high.X freqRatio
## ID ID -0.09085945 FALSE 0.09085945 <NA> 1
## Arrest Arrest NA TRUE NA <NA> NA
## percentUnique zeroVar nzv is.cor.y.abs.low interaction.feat
## ID 100 FALSE FALSE FALSE NA
## Arrest NA NA NA NA NA
## shapiro.test.p.value rsp_var_raw id_var rsp_var
## ID 1.66718e-42 FALSE TRUE NA
## Arrest NA NA NA TRUE
## [1] "glb_feats_df vs. glbObsAll: "
## character(0)
## [1] "glbObsAll vs. glb_feats_df: "
## character(0)
## label step_major step_minor label_minor bgn end
## 15 select.features 7 0 0 504.131 552.914
## 16 fit.models 8 0 0 552.914 NA
## elapsed
## 15 48.783
## 16 NA
8.0: fit models
fit.models_0_chunk_df <- myadd_chunk(NULL, "fit.models_0_bgn", label.minor = "setup")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_0_bgn 1 0 setup 553.656 NA NA
# load(paste0(glbOut$pfx, "dsk.RData"))
# Builds the one-sided formula used to rank the rows of glb_models_df.
#
# Every entry of glbMdlMetricsEval that is also a column of glb_models_df
# contributes a signed term: "-" when the metric name contains "max",
# "+" otherwise. For binomial classification a trailing
# "- opt.prob.threshold.OOB" term is appended.
#
# Reads the globals glbMdlMetricsEval, glb_models_df, glb_is_classification
# and glb_is_binomial; takes no arguments.
# Returns a formula object of the form `~ <sign> metric1 <sign> metric2 ...`.
get_model_sel_frmla <- function() {
  # Keep only metrics that exist as columns
  # (e.g. min.aic.fit might not be available)
  avlMetrics <- glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)]

  # One (sign, metric) pair per available metric, flattened in order
  signedTerms <- unlist(
    lapply(avlMetrics, function(metric) {
      c(if (grepl("max", metric)) "-" else "+", metric)
    }),
    use.names = FALSE
  )

  if (glb_is_classification && glb_is_binomial) {
    signedTerms <- c(signedTerms, "-", "opt.prob.threshold.OOB")
  }

  as.formula(paste(c("~ ", signedTerms), collapse = " "))
}
# Returns glb_models_df ordered by get_model_sel_frmla() and restricted to the
# display columns: "id", the glbMdlMetricsEval metrics present in
# glb_models_df, and every column whose name contains "opt.".
#
# Side effects:
#   * prints and warns about models in glb_models_lst whose tuning results
#     have no more rows than the model has tuning parameters;
#   * writes a two-column grid of tuning plots (one per remaining model that
#     has at least one tuning parameter) to <glbOut$pfx>bestTune.png.
#
# Reads the globals glbMdlMetricsEval, glb_models_df, glb_models_lst, glbOut.
get_dsp_models_df <- function() {
  dsp_models_cols <- c("id",
    glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
    grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
  dsp_models_df <-
    #orderBy(get_model_sel_frmla(), glb_models_df)[, c("id", glbMdlMetricsEval)]
    orderBy(get_model_sel_frmla(), glb_models_df)[, dsp_models_cols]

  # vapply instead of sapply: the result is always a (named) atomic vector,
  # even when glb_models_lst is empty
  nCvMdl <- vapply(glb_models_lst, function(mdl) nrow(mdl$results), integer(1))
  nParams <- vapply(glb_models_lst, function(mdl) {
    # "custom" models are treated as having no tuning parameters;
    # modelLookup() is only consulted for the other methods
    if (mdl$method == "custom") {
      0
    } else {
      nrow(subset(modelLookup(mdl$method), parameter != "parameter"))
    }
  }, numeric(1))
  #     nCvMdl <- nCvMdl[names(nCvMdl) != "avNNet"]
  #     nParams <- nParams[names(nParams) != "avNNet"]

  if (length(cvMdlProblems <- nCvMdl[nCvMdl <= nParams]) > 0) {
    print("Cross Validation issues:")
    warning("Cross Validation issues:")
    print(cvMdlProblems)
  }

  # Plot only models without CV issues that have something to tune
  pltMdls <- setdiff(names(nCvMdl), names(cvMdlProblems))
  pltMdls <- setdiff(pltMdls, names(nParams[nParams == 0]))

  # length(pltMdls) == 21
  png(paste0(glbOut$pfx, "bestTune.png"), width = 480 * 2, height = 480 * 4)
  pngDev <- dev.cur()
  # Guarantee the png device is closed even if a plot below fails;
  # this is a no-op once the explicit dev.off() further down has run
  on.exit(if (pngDev %in% dev.list()) dev.off(pngDev), add = TRUE)

  grid.newpage()
  pushViewport(viewport(layout = grid.layout(ceiling(length(pltMdls) / 2.0), 2)))
  pltIx <- 1
  for (mdlId in pltMdls) {
    # Fill the layout row by row, two plots per row
    print(ggplot(glb_models_lst[[mdlId]], highBestTune = TRUE) + labs(title = mdlId),
          vp = viewport(layout.pos.row = ceiling(pltIx / 2.0),
                        layout.pos.col = ((pltIx - 1) %% 2) + 1))
    pltIx <- pltIx + 1
  }
  dev.off(pngDev)

  # NOTE(review): row names are replaced only when *every* row name differs
  # from its id; a partially matching set is left untouched - confirm intent
  if (all(row.names(dsp_models_df) != dsp_models_df$id))
    row.names(dsp_models_df) <- dsp_models_df$id

  return(dsp_models_df)
}
#get_dsp_models_df()
# Binomial classification cannot be fit unless the response column of the
# fit data contains at least two distinct values; abort early otherwise.
if (glb_is_classification && glb_is_binomial) {
  if (length(unique(glbObsFit[, glb_rsp_var])) < 2) {
    stop("glbObsFit$", glb_rsp_var, ": contains less than 2 unique values: ",
         paste0(unique(glbObsFit[, glb_rsp_var]), collapse=", "))
  }
}
# Choose up to two features for the Max.cor.Y models.
# Candidates: not excluded as a feature, not near-zero-variance, not flagged
# as low-correlation with the response, and without a highly correlated
# partner (cor.high.X is NA).
max_cor_y_x_vars <- subset(
  glb_feats_df,
  (exclude.as.feat == 0) & !nzv & !is.cor.y.abs.low & is.na(cor.high.X)
)
# Rank candidates by descending |cor.y| and take the first two ids;
# indexing rows 1:2 yields NA when fewer than two candidates exist
max_cor_y_x_vars <- orderBy(~ -cor.y.abs, max_cor_y_x_vars)[1:2, "id"]
max_cor_y_x_vars <- max_cor_y_x_vars[!is.na(max_cor_y_x_vars)]
# Fall back on ".pos" when fewer than two features qualified
if (length(max_cor_y_x_vars) < 2) {
  max_cor_y_x_vars <- union(max_cor_y_x_vars, ".pos")
}
# When a Baseline variable is configured, it must be at least as strongly
# correlated (in absolute value) with the response as the top-ranked
# feature; otherwise stop, because the configured Baseline is not the best
# single-variable choice.
if (!is.null(glb_Baseline_mdl_var)) {
  # `&&` (scalar, short-circuit) is the correct operator inside `if`;
  # the correlation lookup is only needed when the two names differ
  if ((max_cor_y_x_vars[1] != glb_Baseline_mdl_var) &&
      (glb_feats_df[glb_feats_df$id == max_cor_y_x_vars[1], "cor.y.abs"] >
       glb_feats_df[glb_feats_df$id == glb_Baseline_mdl_var, "cor.y.abs"]))
    stop(max_cor_y_x_vars[1], " has a higher correlation with ", glb_rsp_var,
         " than the Baseline var: ", glb_Baseline_mdl_var)
}
# Problem type label passed to the model specs below
glb_model_type <- ifelse(glb_is_regression, "regression", "classification")

# Model specs
#       c("id.prefix", "method", "type",
#         # trainControl params
#         "preProc.method", "cv.n.folds", "cv.n.repeats", "summary.fn",
#         # train params
#         "metric", "metric.maximize", "tune.df")

# Baseline: fitted only when a Baseline variable has been configured
# NOTE(review): this call passes mdl_id / model_method directly, while the
# other myfit_mdl() calls in this chunk pass mdl_specs_lst - confirm that
# myfit_mdl still accepts this form
if (!is.null(glb_Baseline_mdl_var)) {
  fit.models_0_chunk_df <- myadd_chunk(
    fit.models_0_chunk_df,
    paste0("fit.models_0_", "Baseline"),
    major.inc = FALSE,
    label.minor = "mybaseln_classfr"
  )
  ret_lst <- myfit_mdl(
    mdl_id = "Baseline",
    model_method = "mybaseln_classfr",
    indepVar = glb_Baseline_mdl_var,
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
  )
}
# Most Frequent Outcome "MFO" model: mean(y) for regression
#   Not using caret's nullModel since model stats are not available
#   Cannot use rpart for multinomial classification since it predicts non-MFO
# Both reference models below use only the random-noise feature .rnorm and
# are fit without resampling (trainControl.method = "none").
if (glb_is_classification) {
  fit.models_0_chunk_df <- myadd_chunk(
    fit.models_0_chunk_df,
    paste0("fit.models_0_", "MFO"),
    major.inc = FALSE,
    label.minor = "myMFO_classfr"
  )
  ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
      id.prefix = "MFO",
      type = glb_model_type,
      trainControl.method = "none",
      train.method = ifelse(glb_is_regression, "lm", "myMFO_classfr")
    )),
    indepVar = ".rnorm",
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
  )

  # "random" model - only for classification;
  #   none needed for regression since it is the same as MFO
  fit.models_0_chunk_df <- myadd_chunk(
    fit.models_0_chunk_df,
    paste0("fit.models_0_", "Random"),
    major.inc = FALSE,
    label.minor = "myrandom_classfr"
  )
  #stop("here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
  ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
      id.prefix = "Random",
      type = glb_model_type,
      trainControl.method = "none",
      train.method = "myrandom_classfr"
    )),
    indepVar = ".rnorm",
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
  )
}
## label step_major step_minor label_minor bgn end
## 1 fit.models_0_bgn 1 0 setup 553.656 553.693
## 2 fit.models_0_MFO 1 1 myMFO_classfr 553.693 NA
## elapsed
## 1 0.037
## 2 NA
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "fitting model: MFO###myMFO_classfr"
## [1] " indepVar: .rnorm"
## [1] "myfit_mdl: setup complete: 0.444000 secs"
## Fitting parameter = none on full training set
## [1] "in MFO.Classifier$fit"
## [1] "unique.vals:"
## [1] F T
## Levels: F T
## [1] "unique.prob:"
## y
## F T
## 0.91892668 0.08107332
## [1] "MFO.val:"
## [1] "F"
## [1] "myfit_mdl: train complete: 2.181000 secs"
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 -none- numeric
## MFO.val 1 -none- character
## x.names 1 -none- character
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] "myfit_mdl: train diagnostics complete: 2.183000 secs"
## Loading required namespace: pROC
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## [1] "in MFO.Classifier$prob"
## F T
## 1 0.9189267 0.08107332
## 2 0.9189267 0.08107332
## 3 0.9189267 0.08107332
## 4 0.9189267 0.08107332
## 5 0.9189267 0.08107332
## 6 0.9189267 0.08107332
## Prediction
## Reference F T
## F 0 176093
## T 0 15536
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.08107332 0.00000000 0.07985499 0.08230435 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## [1] "in MFO.Classifier$prob"
## F T
## 1 0.9189267 0.08107332
## 2 0.9189267 0.08107332
## 3 0.9189267 0.08107332
## 4 0.9189267 0.08107332
## 5 0.9189267 0.08107332
## 6 0.9189267 0.08107332
## Prediction
## Reference F T
## F 0 176093
## T 0 15536
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.08107332 0.00000000 0.07985499 0.08230435 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "myfit_mdl: predict complete: 45.374000 secs"
## id feats max.nTuningRuns min.elapsedtime.everything
## 1 MFO###myMFO_classfr .rnorm 0 1.46
## min.elapsedtime.final max.AUCpROC.fit max.Sens.fit max.Spec.fit
## 1 0.043 0.5 1 0
## max.AUCROCR.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0 0.1499867 0.08107332
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.07985499 0.08230435 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0 0.1499867 0.08107332
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.07985499 0.08230435 0
## [1] "myfit_mdl: exit: 45.384000 secs"
## label step_major step_minor label_minor bgn
## 2 fit.models_0_MFO 1 1 myMFO_classfr 553.693
## 3 fit.models_0_Random 1 2 myrandom_classfr 599.084
## end elapsed
## 2 599.083 45.391
## 3 NA NA
## [1] "myfit_mdl: enter: 0.001000 secs"
## [1] "fitting model: Random###myrandom_classfr"
## [1] " indepVar: .rnorm"
## [1] "myfit_mdl: setup complete: 0.471000 secs"
## Fitting parameter = none on full training set
## [1] "myfit_mdl: train complete: 1.307000 secs"
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 table numeric
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] "myfit_mdl: train diagnostics complete: 1.308000 secs"
## [1] "in Random.Classifier$prob"
## Prediction
## Reference F T
## F 0 176093
## T 0 15536
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.08107332 0.00000000 0.07985499 0.08230435 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "in Random.Classifier$prob"
## Prediction
## Reference F T
## F 0 176093
## T 0 15536
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.08107332 0.00000000 0.07985499 0.08230435 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "myfit_mdl: predict complete: 582.354000 secs"
## id feats max.nTuningRuns
## 1 Random###myrandom_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 0.608 0.042 0.5008987
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.9191507 0.08264676 0.4996512 0
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.1499867 0.08107332 0.07985499
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.08230435 0 0.4998215 0.9191847
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.08045829 0.4997988 0 0.1499867
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.08107332 0.07985499 0.08230435
## max.Kappa.OOB
## 1 0
## [1] "myfit_mdl: exit: 582.367000 secs"
# Max.cor.Y
#   Check the impact of cross-validation settings
#   rpart is not a good candidate since caret does not optimize cp
#   (the only tuning parameter of rpart) well
# Register the timing chunk for the Max.cor.Y.rcv.* family of glmnet fits
fit.models_0_chunk_df <- myadd_chunk(
  fit.models_0_chunk_df,
  paste0("fit.models_0_", "Max.cor.Y.rcv.*X*"),
  major.inc = FALSE,
  label.minor = "glmnet"
)
## label step_major step_minor label_minor
## 3 fit.models_0_Random 1 2 myrandom_classfr
## 4 fit.models_0_Max.cor.Y.rcv.*X* 1 3 glmnet
## bgn end elapsed
## 3 599.084 1181.463 582.379
## 4 1181.463 NA NA
# Reference glmnet fit on the max-correlation features without any
# resampling (trainControl.method = "none")
ret_lst <- myfit_mdl(
  mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
    id.prefix = "Max.cor.Y.rcv.1X1",
    type = glb_model_type,
    trainControl.method = "none",
    train.method = "glmnet"
  )),
  indepVar = max_cor_y_x_vars,
  rsp_var = glb_rsp_var,
  fit_df = glbObsFit,
  OOB_df = glbObsOOB
)
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "fitting model: Max.cor.Y.rcv.1X1###glmnet"
## [1] " indepVar: ID,LocationDescription.my.fctr"
## [1] "myfit_mdl: setup complete: 0.741000 secs"
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 2.0-5
## Fitting alpha = 0.1, lambda = 0.000496 on full training set
## [1] "myfit_mdl: train complete: 6.052000 secs"
## Length Class Mode
## a0 52 -none- numeric
## beta 728 dgCMatrix S4
## df 52 -none- numeric
## dim 2 -none- numeric
## lambda 52 -none- numeric
## dev.ratio 52 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 14 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -1.868051e+00
## ID
## -1.387184e-07
## LocationDescription.my.fctrALLEY
## 3.290246e-01
## LocationDescription.my.fctrCommercialVehicle
## 1.018814e+00
## LocationDescription.my.fctrEntertainment
## 5.172409e-01
## LocationDescription.my.fctrGAS STATION
## 1.114496e+00
## LocationDescription.my.fctrGovernment
## 8.532385e-01
## LocationDescription.my.fctrOther
## 3.823671e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 3.208419e-01
## LocationDescription.my.fctrResidence
## 2.779245e-01
## LocationDescription.my.fctrSchool
## 7.560449e-01
## LocationDescription.my.fctrSidewalk
## 1.803735e-01
## LocationDescription.my.fctrVACANT LOT/LAND
## -7.232414e-02
## LocationDescription.my.fctrVEHICLE NON-COMMERCIAL
## 1.691812e-01
## LocationDescription.my.fctrcha
## -1.434990e-01
## [1] "max lambda < lambdaOpt:"
## [1] "Feats mismatch between coefs_left & rght:"
## [1] "(Intercept)"
## [2] "ID"
## [3] "LocationDescription.my.fctrALLEY"
## [4] "LocationDescription.my.fctrCommercialVehicle"
## [5] "LocationDescription.my.fctrEntertainment"
## [6] "LocationDescription.my.fctrGAS STATION"
## [7] "LocationDescription.my.fctrGovernment"
## [8] "LocationDescription.my.fctrOther"
## [9] "LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)"
## [10] "LocationDescription.my.fctrResidence"
## [11] "LocationDescription.my.fctrSchool"
## [12] "LocationDescription.my.fctrSidewalk"
## [13] "LocationDescription.my.fctrVACANT LOT/LAND"
## [14] "LocationDescription.my.fctrVEHICLE NON-COMMERCIAL"
## [15] "LocationDescription.my.fctrcha"
## [1] "myfit_mdl: train diagnostics complete: 6.162000 secs"
## Prediction
## Reference F T
## F 134858 41235
## T 10125 5411
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7319821 0.0596594 0.7299929 0.7339643 0.9189267
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## Prediction
## Reference F T
## F 134858 41235
## T 10125 5411
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7319821 0.0596594 0.7299929 0.7339643 0.9189267
## AccuracyPValue McnemarPValue
## 1.0000000 0.0000000
## [1] "myfit_mdl: predict complete: 256.749000 secs"
## id feats
## 1 Max.cor.Y.rcv.1X1###glmnet ID,LocationDescription.my.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 4.977 3.256
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.6119694
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.1740375 0.7319821
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7299929 0.7339643 0.0596594
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.6119694
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.1740375 0.7319821
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7299929 0.7339643 0.0596594
## [1] "myfit_mdl: exit: 256.762000 secs"
# Optional experiment: refit the Max.cor.Y glmnet model under several
# repeated-CV settings (folds x repeats), then plot the resulting metrics
# side by side to help choose the CV parameters.
if (glbMdlCheckRcv) {
  # rcv_n_folds == 1 & rcv_n_repeats > 1 crashes
  for (rcv_n_folds in seq(3, glb_rcv_n_folds + 2, 2)) {
    for (rcv_n_repeats in seq(1, glb_rcv_n_repeats + 2, 2)) {
      # Experiment specific code to avoid caret crash
      #     lcl_tune_models_df <- rbind(data.frame()
      #                         ,data.frame(method = "glmnet", parameter = "alpha",
      #                                     vals = "0.100 0.325 0.550 0.775 1.000")
      #                         ,data.frame(method = "glmnet", parameter = "lambda",
      #                                     vals = "9.342e-02")
      #                                 )
      ret_lst <- myfit_mdl(
        mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
          # Model id encodes the CV setting as <folds>X<repeats>
          id.prefix = paste0("Max.cor.Y.rcv.", rcv_n_folds, "X", rcv_n_repeats),
          type = glb_model_type,
          # tune.df = lcl_tune_models_df,
          trainControl.method = "repeatedcv",
          trainControl.number = rcv_n_folds,
          trainControl.repeats = rcv_n_repeats,
          trainControl.classProbs = glb_is_classification,
          trainControl.summaryFunction = glbMdlMetricSummaryFn,
          train.method = "glmnet",
          train.metric = glbMdlMetricSummary,
          train.maximize = glbMdlMetricMaximize
        )),
        indepVar = max_cor_y_x_vars,
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit,
        OOB_df = glbObsOOB
      )
    }
  }

  # Add parallel coordinates graph of glb_models_df[, glbMdlMetricsEval]
  # to evaluate cv parameters
  tmp_models_cols <- c(
    "id", "max.nTuningRuns",
    glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
    grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE)
  )
  # Only the Max.cor.Y.rcv.* models take part in the comparison
  print(myplot_parcoord(
    obs_df = subset(glb_models_df,
                    grepl("Max.cor.Y.rcv.", id, fixed = TRUE),
                    select = -feats)[, tmp_models_cols],
    id_var = "id"
  ))
}
# Useful for stacking decisions
# fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
# paste0("fit.models_0_", "Max.cor.Y[rcv.1X1.cp.0|]"), major.inc = FALSE,
# label.minor = "rpart")
#
# ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
# id.prefix = "Max.cor.Y.rcv.1X1.cp.0", type = glb_model_type, trainControl.method = "none",
# train.method = "rpart",
# tune.df=data.frame(method="rpart", parameter="cp", min=0.0, max=0.0, by=0.1))),
# indepVar=max_cor_y_x_vars, rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB)
#stop(here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
# if (glb_is_regression || glb_is_binomial) # For multinomials this model will be run next by default
# Baseline rpart model on the max-correlation features, resampled with
# repeated CV using the global fold / repeat settings. The result of
# myfit_mdl() is kept in ret_lst.
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
        id.prefix = "Max.cor.Y",
        type = glb_model_type,
        trainControl.method = "repeatedcv",
        trainControl.number = glb_rcv_n_folds,
        trainControl.repeats = glb_rcv_n_repeats,
        trainControl.classProbs = glb_is_classification,
        trainControl.summaryFunction = glbMdlMetricSummaryFn,
        trainControl.allowParallel = glbMdlAllowParallel,
        train.metric = glbMdlMetricSummary,
        train.maximize = glbMdlMetricMaximize,
        train.method = "rpart"
    )),
    indepVar = max_cor_y_x_vars,
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB
)
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "fitting model: Max.cor.Y##rcv#rpart"
## [1] " indepVar: ID,LocationDescription.my.fctr"
## [1] "myfit_mdl: setup complete: 0.739000 secs"
## Loading required package: rpart
## + Fold1.Rep1: cp=2.146e-05
## - Fold1.Rep1: cp=2.146e-05
## + Fold2.Rep1: cp=2.146e-05
## - Fold2.Rep1: cp=2.146e-05
## + Fold3.Rep1: cp=2.146e-05
## - Fold3.Rep1: cp=2.146e-05
## + Fold1.Rep2: cp=2.146e-05
## - Fold1.Rep2: cp=2.146e-05
## + Fold2.Rep2: cp=2.146e-05
## - Fold2.Rep2: cp=2.146e-05
## + Fold3.Rep2: cp=2.146e-05
## - Fold3.Rep2: cp=2.146e-05
## + Fold1.Rep3: cp=2.146e-05
## - Fold1.Rep3: cp=2.146e-05
## + Fold2.Rep3: cp=2.146e-05
## - Fold2.Rep3: cp=2.146e-05
## + Fold3.Rep3: cp=2.146e-05
## - Fold3.Rep3: cp=2.146e-05
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 4.92e-05 on full training set
## [1] "myfit_mdl: train complete: 79.850000 secs"
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y", : model's bestTune found at an extreme of
## tuneGrid for parameter: cp
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 191629
##
## CP nsplit rel error
## 1 4.922154e-05 0 1
##
## Node number 1: 191629 observations
## predicted class=F expected loss=0.08107332 P(node) =1
## class counts: 176093 15536
## probabilities: 0.919 0.081
##
## n= 191629
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 191629 15536 F (0.91892668 0.08107332) *
## [1] "myfit_mdl: train diagnostics complete: 80.462000 secs"
## Prediction
## Reference F T
## F 0 176093
## T 0 15536
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.08107332 0.00000000 0.07985499 0.08230435 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## Prediction
## Reference F T
## F 0 176093
## T 0 15536
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.08107332 0.00000000 0.07985499 0.08230435 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "myfit_mdl: predict complete: 123.795000 secs"
## id feats max.nTuningRuns
## 1 Max.cor.Y##rcv#rpart ID,LocationDescription.my.fctr 5
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 78.814 3.236 0.5
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 1 0 0.5 0
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.1499867 0.9185527 0.07985499
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.08230435 0.002468544 0.5 1
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0 0.5 0 0.1499867
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.08107332 0.07985499 0.08230435
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0 0.0001590894 0.001228904
## [1] "myfit_mdl: exit: 123.810000 secs"
# Max.cor.Y.Time.Poly: glmnet on the max-correlation features plus every
# "<date-time feature>.day.minutes.poly.<n>" column found in glbObsAll.
#
# The pattern alternates over ALL date-time feature names. Pasting the names
# straight onto the suffix produces one regex per feature, and grep()/grepl()
# only honour the first element of a multi-element pattern (with a warning),
# which silently ignored the polynomial columns of any additional feature.
timePolyPtn <- paste0("(", paste(names(glbFeatsDateTime), collapse = "|"),
                      ")\\.day\\.minutes\\.poly\\.")
if ((length(glbFeatsDateTime) > 0) &&
    any(grepl(timePolyPtn, names(glbObsAll)))) {
    fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
        paste0("fit.models_0_", "Max.cor.Y.Time.Poly"), major.inc = FALSE,
        label.minor = "glmnet")

    # Base features followed by the matching polynomial time columns
    indepVars <- c(max_cor_y_x_vars,
                   grep(timePolyPtn, names(glbObsAll), value = TRUE))
    indepVars <- myadjustInteractionFeats(glb_feats_df, indepVars)

    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Max.cor.Y.Time.Poly",
            type = glb_model_type, trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            trainControl.allowParallel = glbMdlAllowParallel,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        indepVar = indepVars,
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## label step_major step_minor label_minor
## 4 fit.models_0_Max.cor.Y.rcv.*X* 1 3 glmnet
## 5 fit.models_0_Max.cor.Y.Time.Poly 1 4 glmnet
## bgn end elapsed
## 4 1181.463 1570.016 388.554
## 5 1570.017 NA NA
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "fitting model: Max.cor.Y.Time.Poly##rcv#glmnet"
## [1] " indepVar: ID,LocationDescription.my.fctr,Date.day.minutes.poly.1,Date.day.minutes.poly.2,Date.day.minutes.poly.3,Date.day.minutes.poly.4,Date.day.minutes.poly.5"
## [1] "myfit_mdl: setup complete: 0.825000 secs"
## + Fold1.Rep1: alpha=0.100, lambda=0.01069
## - Fold1.Rep1: alpha=0.100, lambda=0.01069
## + Fold1.Rep1: alpha=0.325, lambda=0.01069
## - Fold1.Rep1: alpha=0.325, lambda=0.01069
## + Fold1.Rep1: alpha=0.550, lambda=0.01069
## - Fold1.Rep1: alpha=0.550, lambda=0.01069
## + Fold1.Rep1: alpha=0.775, lambda=0.01069
## - Fold1.Rep1: alpha=0.775, lambda=0.01069
## + Fold1.Rep1: alpha=1.000, lambda=0.01069
## - Fold1.Rep1: alpha=1.000, lambda=0.01069
## + Fold2.Rep1: alpha=0.100, lambda=0.01069
## - Fold2.Rep1: alpha=0.100, lambda=0.01069
## + Fold2.Rep1: alpha=0.325, lambda=0.01069
## - Fold2.Rep1: alpha=0.325, lambda=0.01069
## + Fold2.Rep1: alpha=0.550, lambda=0.01069
## - Fold2.Rep1: alpha=0.550, lambda=0.01069
## + Fold2.Rep1: alpha=0.775, lambda=0.01069
## - Fold2.Rep1: alpha=0.775, lambda=0.01069
## + Fold2.Rep1: alpha=1.000, lambda=0.01069
## - Fold2.Rep1: alpha=1.000, lambda=0.01069
## + Fold3.Rep1: alpha=0.100, lambda=0.01069
## - Fold3.Rep1: alpha=0.100, lambda=0.01069
## + Fold3.Rep1: alpha=0.325, lambda=0.01069
## - Fold3.Rep1: alpha=0.325, lambda=0.01069
## + Fold3.Rep1: alpha=0.550, lambda=0.01069
## - Fold3.Rep1: alpha=0.550, lambda=0.01069
## + Fold3.Rep1: alpha=0.775, lambda=0.01069
## - Fold3.Rep1: alpha=0.775, lambda=0.01069
## + Fold3.Rep1: alpha=1.000, lambda=0.01069
## - Fold3.Rep1: alpha=1.000, lambda=0.01069
## + Fold1.Rep2: alpha=0.100, lambda=0.01069
## - Fold1.Rep2: alpha=0.100, lambda=0.01069
## + Fold1.Rep2: alpha=0.325, lambda=0.01069
## - Fold1.Rep2: alpha=0.325, lambda=0.01069
## + Fold1.Rep2: alpha=0.550, lambda=0.01069
## - Fold1.Rep2: alpha=0.550, lambda=0.01069
## + Fold1.Rep2: alpha=0.775, lambda=0.01069
## - Fold1.Rep2: alpha=0.775, lambda=0.01069
## + Fold1.Rep2: alpha=1.000, lambda=0.01069
## - Fold1.Rep2: alpha=1.000, lambda=0.01069
## + Fold2.Rep2: alpha=0.100, lambda=0.01069
## - Fold2.Rep2: alpha=0.100, lambda=0.01069
## + Fold2.Rep2: alpha=0.325, lambda=0.01069
## - Fold2.Rep2: alpha=0.325, lambda=0.01069
## + Fold2.Rep2: alpha=0.550, lambda=0.01069
## - Fold2.Rep2: alpha=0.550, lambda=0.01069
## + Fold2.Rep2: alpha=0.775, lambda=0.01069
## - Fold2.Rep2: alpha=0.775, lambda=0.01069
## + Fold2.Rep2: alpha=1.000, lambda=0.01069
## - Fold2.Rep2: alpha=1.000, lambda=0.01069
## + Fold3.Rep2: alpha=0.100, lambda=0.01069
## - Fold3.Rep2: alpha=0.100, lambda=0.01069
## + Fold3.Rep2: alpha=0.325, lambda=0.01069
## - Fold3.Rep2: alpha=0.325, lambda=0.01069
## + Fold3.Rep2: alpha=0.550, lambda=0.01069
## - Fold3.Rep2: alpha=0.550, lambda=0.01069
## + Fold3.Rep2: alpha=0.775, lambda=0.01069
## - Fold3.Rep2: alpha=0.775, lambda=0.01069
## + Fold3.Rep2: alpha=1.000, lambda=0.01069
## - Fold3.Rep2: alpha=1.000, lambda=0.01069
## + Fold1.Rep3: alpha=0.100, lambda=0.01069
## - Fold1.Rep3: alpha=0.100, lambda=0.01069
## + Fold1.Rep3: alpha=0.325, lambda=0.01069
## - Fold1.Rep3: alpha=0.325, lambda=0.01069
## + Fold1.Rep3: alpha=0.550, lambda=0.01069
## - Fold1.Rep3: alpha=0.550, lambda=0.01069
## + Fold1.Rep3: alpha=0.775, lambda=0.01069
## - Fold1.Rep3: alpha=0.775, lambda=0.01069
## + Fold1.Rep3: alpha=1.000, lambda=0.01069
## - Fold1.Rep3: alpha=1.000, lambda=0.01069
## + Fold2.Rep3: alpha=0.100, lambda=0.01069
## - Fold2.Rep3: alpha=0.100, lambda=0.01069
## + Fold2.Rep3: alpha=0.325, lambda=0.01069
## - Fold2.Rep3: alpha=0.325, lambda=0.01069
## + Fold2.Rep3: alpha=0.550, lambda=0.01069
## - Fold2.Rep3: alpha=0.550, lambda=0.01069
## + Fold2.Rep3: alpha=0.775, lambda=0.01069
## - Fold2.Rep3: alpha=0.775, lambda=0.01069
## + Fold2.Rep3: alpha=1.000, lambda=0.01069
## - Fold2.Rep3: alpha=1.000, lambda=0.01069
## + Fold3.Rep3: alpha=0.100, lambda=0.01069
## - Fold3.Rep3: alpha=0.100, lambda=0.01069
## + Fold3.Rep3: alpha=0.325, lambda=0.01069
## - Fold3.Rep3: alpha=0.325, lambda=0.01069
## + Fold3.Rep3: alpha=0.550, lambda=0.01069
## - Fold3.Rep3: alpha=0.550, lambda=0.01069
## + Fold3.Rep3: alpha=0.775, lambda=0.01069
## - Fold3.Rep3: alpha=0.775, lambda=0.01069
## + Fold3.Rep3: alpha=1.000, lambda=0.01069
## - Fold3.Rep3: alpha=1.000, lambda=0.01069
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0107 on full training set
## [1] "myfit_mdl: train complete: 206.835000 secs"
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Poly", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Poly", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 52 -none- numeric
## beta 988 dgCMatrix S4
## df 52 -none- numeric
## dim 2 -none- numeric
## lambda 52 -none- numeric
## dev.ratio 52 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 19 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -1.928136e+00
## Date.day.minutes.poly.1
## 4.132103e+00
## Date.day.minutes.poly.2
## -4.070520e+01
## Date.day.minutes.poly.3
## -4.723390e+00
## Date.day.minutes.poly.5
## 1.052921e+01
## ID
## -1.194850e-07
## LocationDescription.my.fctrALLEY
## 1.907031e-01
## LocationDescription.my.fctrCommercialVehicle
## 8.095224e-01
## LocationDescription.my.fctrEntertainment
## 2.844510e-01
## LocationDescription.my.fctrGAS STATION
## 9.802804e-01
## LocationDescription.my.fctrGovernment
## 5.954836e-01
## LocationDescription.my.fctrOther
## 2.640365e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.261467e-01
## LocationDescription.my.fctrResidence
## 1.569371e-01
## LocationDescription.my.fctrSchool
## 5.070140e-01
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -1.921925e+00
## Date.day.minutes.poly.1
## 4.969140e+00
## Date.day.minutes.poly.2
## -4.173884e+01
## Date.day.minutes.poly.3
## -5.426019e+00
## Date.day.minutes.poly.5
## 1.132112e+01
## ID
## -1.213892e-07
## LocationDescription.my.fctrALLEY
## 2.032875e-01
## LocationDescription.my.fctrCommercialVehicle
## 8.275865e-01
## LocationDescription.my.fctrEntertainment
## 3.064957e-01
## LocationDescription.my.fctrGAS STATION
## 9.935201e-01
## LocationDescription.my.fctrGovernment
## 6.210328e-01
## LocationDescription.my.fctrOther
## 2.729942e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.320208e-01
## LocationDescription.my.fctrResidence
## 1.684215e-01
## LocationDescription.my.fctrSchool
## 5.288919e-01
## [1] "myfit_mdl: train diagnostics complete: 207.428000 secs"
## Prediction
## Reference F T
## F 140098 35995
## T 10530 5006
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.75721316 0.06742643 0.75528694 0.75913159 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## Prediction
## Reference F T
## F 140098 35995
## T 10530 5006
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.75721316 0.06742643 0.75528694 0.75913159 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "myfit_mdl: predict complete: 440.762000 secs"
## id
## 1 Max.cor.Y.Time.Poly##rcv#glmnet
## feats
## 1 ID,LocationDescription.my.fctr,Date.day.minutes.poly.1,Date.day.minutes.poly.2,Date.day.minutes.poly.3,Date.day.minutes.poly.4,Date.day.minutes.poly.5
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 20 205.599 4.4
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.615125
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.1770876 0.9189267
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7552869 0.7591316 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.615125
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.1770876 0.7572132
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7552869 0.7591316 0.06742643
## max.AccuracySD.fit max.KappaSD.fit
## 1 7.220992e-06 0
## [1] "myfit_mdl: exit: 440.776000 secs"
# Max.cor.Y.Time.Lag: glmnet on the max-correlation features plus every
# "<date-time feature>.last<digit>..." column found in glbObsAll.
#
# The pattern alternates over ALL date-time feature names. Pasting the names
# straight onto the suffix produces one regex per feature, and grep()/grepl()
# only honour the first element of a multi-element pattern (with a warning),
# which silently ignored the lag columns of any additional feature.
timeLagPtn <- paste0("(", paste(names(glbFeatsDateTime), collapse = "|"),
                     ")\\.last[[:digit:]]")
if ((length(glbFeatsDateTime) > 0) &&
    any(grepl(timeLagPtn, names(glbObsAll)))) {
    fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
        paste0("fit.models_0_", "Max.cor.Y.Time.Lag"), major.inc = FALSE,
        label.minor = "glmnet")

    # Base features followed by the matching lag columns
    indepVars <- c(max_cor_y_x_vars,
                   grep(timeLagPtn, names(glbObsAll), value = TRUE))
    indepVars <- myadjustInteractionFeats(glb_feats_df, indepVars)

    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Max.cor.Y.Time.Lag",
            type = glb_model_type,
            tune.df = glbMdlTuneParams,
            trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            trainControl.allowParallel = glbMdlAllowParallel,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        indepVar = indepVars,
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## label step_major step_minor label_minor
## 5 fit.models_0_Max.cor.Y.Time.Poly 1 4 glmnet
## 6 fit.models_0_Max.cor.Y.Time.Lag 1 5 glmnet
## bgn end elapsed
## 5 1570.017 2016.028 446.011
## 6 2016.029 NA NA
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "fitting model: Max.cor.Y.Time.Lag##rcv#glmnet"
## [1] " indepVar: ID,LocationDescription.my.fctr,Date.last2.log1p,Date.last4.log1p,Date.last8.log1p,Date.last16.log1p,Date.last32.log1p"
## [1] "myfit_mdl: setup complete: 0.723000 secs"
## + Fold1.Rep1: alpha=0.100, lambda=0.01069
## - Fold1.Rep1: alpha=0.100, lambda=0.01069
## + Fold1.Rep1: alpha=0.325, lambda=0.01069
## - Fold1.Rep1: alpha=0.325, lambda=0.01069
## + Fold1.Rep1: alpha=0.550, lambda=0.01069
## - Fold1.Rep1: alpha=0.550, lambda=0.01069
## + Fold1.Rep1: alpha=0.775, lambda=0.01069
## - Fold1.Rep1: alpha=0.775, lambda=0.01069
## + Fold1.Rep1: alpha=1.000, lambda=0.01069
## - Fold1.Rep1: alpha=1.000, lambda=0.01069
## + Fold2.Rep1: alpha=0.100, lambda=0.01069
## - Fold2.Rep1: alpha=0.100, lambda=0.01069
## + Fold2.Rep1: alpha=0.325, lambda=0.01069
## - Fold2.Rep1: alpha=0.325, lambda=0.01069
## + Fold2.Rep1: alpha=0.550, lambda=0.01069
## - Fold2.Rep1: alpha=0.550, lambda=0.01069
## + Fold2.Rep1: alpha=0.775, lambda=0.01069
## - Fold2.Rep1: alpha=0.775, lambda=0.01069
## + Fold2.Rep1: alpha=1.000, lambda=0.01069
## - Fold2.Rep1: alpha=1.000, lambda=0.01069
## + Fold3.Rep1: alpha=0.100, lambda=0.01069
## - Fold3.Rep1: alpha=0.100, lambda=0.01069
## + Fold3.Rep1: alpha=0.325, lambda=0.01069
## - Fold3.Rep1: alpha=0.325, lambda=0.01069
## + Fold3.Rep1: alpha=0.550, lambda=0.01069
## - Fold3.Rep1: alpha=0.550, lambda=0.01069
## + Fold3.Rep1: alpha=0.775, lambda=0.01069
## - Fold3.Rep1: alpha=0.775, lambda=0.01069
## + Fold3.Rep1: alpha=1.000, lambda=0.01069
## - Fold3.Rep1: alpha=1.000, lambda=0.01069
## + Fold1.Rep2: alpha=0.100, lambda=0.01069
## - Fold1.Rep2: alpha=0.100, lambda=0.01069
## + Fold1.Rep2: alpha=0.325, lambda=0.01069
## - Fold1.Rep2: alpha=0.325, lambda=0.01069
## + Fold1.Rep2: alpha=0.550, lambda=0.01069
## - Fold1.Rep2: alpha=0.550, lambda=0.01069
## + Fold1.Rep2: alpha=0.775, lambda=0.01069
## - Fold1.Rep2: alpha=0.775, lambda=0.01069
## + Fold1.Rep2: alpha=1.000, lambda=0.01069
## - Fold1.Rep2: alpha=1.000, lambda=0.01069
## + Fold2.Rep2: alpha=0.100, lambda=0.01069
## - Fold2.Rep2: alpha=0.100, lambda=0.01069
## + Fold2.Rep2: alpha=0.325, lambda=0.01069
## - Fold2.Rep2: alpha=0.325, lambda=0.01069
## + Fold2.Rep2: alpha=0.550, lambda=0.01069
## - Fold2.Rep2: alpha=0.550, lambda=0.01069
## + Fold2.Rep2: alpha=0.775, lambda=0.01069
## - Fold2.Rep2: alpha=0.775, lambda=0.01069
## + Fold2.Rep2: alpha=1.000, lambda=0.01069
## - Fold2.Rep2: alpha=1.000, lambda=0.01069
## + Fold3.Rep2: alpha=0.100, lambda=0.01069
## - Fold3.Rep2: alpha=0.100, lambda=0.01069
## + Fold3.Rep2: alpha=0.325, lambda=0.01069
## - Fold3.Rep2: alpha=0.325, lambda=0.01069
## + Fold3.Rep2: alpha=0.550, lambda=0.01069
## - Fold3.Rep2: alpha=0.550, lambda=0.01069
## + Fold3.Rep2: alpha=0.775, lambda=0.01069
## - Fold3.Rep2: alpha=0.775, lambda=0.01069
## + Fold3.Rep2: alpha=1.000, lambda=0.01069
## - Fold3.Rep2: alpha=1.000, lambda=0.01069
## + Fold1.Rep3: alpha=0.100, lambda=0.01069
## - Fold1.Rep3: alpha=0.100, lambda=0.01069
## + Fold1.Rep3: alpha=0.325, lambda=0.01069
## - Fold1.Rep3: alpha=0.325, lambda=0.01069
## + Fold1.Rep3: alpha=0.550, lambda=0.01069
## - Fold1.Rep3: alpha=0.550, lambda=0.01069
## + Fold1.Rep3: alpha=0.775, lambda=0.01069
## - Fold1.Rep3: alpha=0.775, lambda=0.01069
## + Fold1.Rep3: alpha=1.000, lambda=0.01069
## - Fold1.Rep3: alpha=1.000, lambda=0.01069
## + Fold2.Rep3: alpha=0.100, lambda=0.01069
## - Fold2.Rep3: alpha=0.100, lambda=0.01069
## + Fold2.Rep3: alpha=0.325, lambda=0.01069
## - Fold2.Rep3: alpha=0.325, lambda=0.01069
## + Fold2.Rep3: alpha=0.550, lambda=0.01069
## - Fold2.Rep3: alpha=0.550, lambda=0.01069
## + Fold2.Rep3: alpha=0.775, lambda=0.01069
## - Fold2.Rep3: alpha=0.775, lambda=0.01069
## + Fold2.Rep3: alpha=1.000, lambda=0.01069
## - Fold2.Rep3: alpha=1.000, lambda=0.01069
## + Fold3.Rep3: alpha=0.100, lambda=0.01069
## - Fold3.Rep3: alpha=0.100, lambda=0.01069
## + Fold3.Rep3: alpha=0.325, lambda=0.01069
## - Fold3.Rep3: alpha=0.325, lambda=0.01069
## + Fold3.Rep3: alpha=0.550, lambda=0.01069
## - Fold3.Rep3: alpha=0.550, lambda=0.01069
## + Fold3.Rep3: alpha=0.775, lambda=0.01069
## - Fold3.Rep3: alpha=0.775, lambda=0.01069
## + Fold3.Rep3: alpha=1.000, lambda=0.01069
## - Fold3.Rep3: alpha=1.000, lambda=0.01069
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0107 on full training set
## [1] "myfit_mdl: train complete: 216.847000 secs"
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Lag", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y.Time.Lag", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 54 -none- numeric
## beta 1026 dgCMatrix S4
## df 54 -none- numeric
## dim 2 -none- numeric
## lambda 54 -none- numeric
## dev.ratio 54 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 19 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.264862e+00
## Date.last16.log1p
## 9.950681e-03
## Date.last2.log1p
## 8.291459e-03
## Date.last32.log1p
## 1.182414e-01
## Date.last4.log1p
## -6.602518e-04
## ID
## -1.253247e-07
## LocationDescription.my.fctrALLEY
## 1.975375e-01
## LocationDescription.my.fctrCommercialVehicle
## 8.246846e-01
## LocationDescription.my.fctrEntertainment
## 2.970899e-01
## LocationDescription.my.fctrGAS STATION
## 9.816650e-01
## LocationDescription.my.fctrGovernment
## 5.982815e-01
## LocationDescription.my.fctrOther
## 2.758487e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.402532e-01
## LocationDescription.my.fctrResidence
## 1.578326e-01
## LocationDescription.my.fctrSchool
## 5.253883e-01
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.326641e+00
## Date.last16.log1p
## 1.131586e-02
## Date.last2.log1p
## 9.205504e-03
## Date.last32.log1p
## 1.250657e-01
## Date.last4.log1p
## -3.103144e-03
## ID
## -1.276264e-07
## LocationDescription.my.fctrALLEY
## 2.102357e-01
## LocationDescription.my.fctrCommercialVehicle
## 8.422321e-01
## LocationDescription.my.fctrEntertainment
## 3.197093e-01
## LocationDescription.my.fctrGAS STATION
## 9.948049e-01
## LocationDescription.my.fctrGovernment
## 6.237895e-01
## LocationDescription.my.fctrOther
## 2.850100e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.463156e-01
## LocationDescription.my.fctrResidence
## 1.692871e-01
## LocationDescription.my.fctrSchool
## 5.465821e-01
## [1] "myfit_mdl: train diagnostics complete: 217.410000 secs"
## Prediction
## Reference F T
## F 141001 35092
## T 10657 4879
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.76126265 0.06683897 0.75934736 0.76317003 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## Prediction
## Reference F T
## F 147499 28594
## T 11394 4142
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.79132595 0.06926659 0.78949953 0.79314355 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "myfit_mdl: predict complete: 445.788000 secs"
## id
## 1 Max.cor.Y.Time.Lag##rcv#glmnet
## feats
## 1 ID,LocationDescription.my.fctr,Date.last2.log1p,Date.last4.log1p,Date.last8.log1p,Date.last16.log1p,Date.last32.log1p
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 20 215.736 5.03
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.6148796
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.1757976 0.9189267
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7593474 0.76317 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.6170827
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.1716109 0.7913259
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7894995 0.7931435 0.06926659
## max.AccuracySD.fit max.KappaSD.fit
## 1 7.220992e-06 0
## [1] "myfit_mdl: exit: 445.802000 secs"
# Text-feature models: when text features are configured, fit three glmnet
# models on max_cor_y_x_vars plus the derived text columns selected by regex.
# Each column pattern is "<upper-cased first letter of the text feature>"
# followed by one of the suffixes below:
#   Max.cor.Y.Text.nonTP : "<X>." NOT followed by "T." or "P."
#   Max.cor.Y.Text.onlyT : "<X>.T."
#   Max.cor.Y.Text.onlyP : "<X>.P."
# All three fits share one spec so they cannot drift apart; previously the
# onlyT fit was the only one missing trainControl.allowParallel.
if (length(glbFeatsText) > 0) {
    fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
        paste0("fit.models_0_", "Txt.*"), major.inc = FALSE,
        label.minor = "glmnet")

    # Model id prefix -> regex suffix appended to the feature's initial.
    # The look-ahead uses [TP]; the earlier [T|P] also matched a literal "|".
    txtMdlPtns <- c(
        "Max.cor.Y.Text.nonTP" = "\\.(?![TP]\\.)",
        "Max.cor.Y.Text.onlyT" = "\\.T\\.",
        "Max.cor.Y.Text.onlyP" = "\\.P\\.")

    for (txtMdlId in names(txtMdlPtns)) {
        # Start from the base features and add the matching columns of every
        # text feature; union() keeps first-seen order and drops duplicates.
        indepVars <- c(max_cor_y_x_vars)
        for (txtFeat in names(glbFeatsText))
            indepVars <- union(indepVars,
                grep(paste0(str_to_upper(substr(txtFeat, 1, 1)),
                            txtMdlPtns[[txtMdlId]]),
                     names(glbObsAll), perl = TRUE, value = TRUE))
        indepVars <- myadjustInteractionFeats(glb_feats_df, indepVars)

        ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
                id.prefix = txtMdlId,
                type = glb_model_type,
                tune.df = glbMdlTuneParams,
                trainControl.method = "repeatedcv",
                trainControl.number = glb_rcv_n_folds,
                trainControl.repeats = glb_rcv_n_repeats,
                trainControl.classProbs = glb_is_classification,
                trainControl.summaryFunction = glbMdlMetricSummaryFn,
                trainControl.allowParallel = glbMdlAllowParallel,
                train.metric = glbMdlMetricSummary,
                train.maximize = glbMdlMetricMaximize,
                train.method = "glmnet")),
            indepVar = indepVars,
            rsp_var = glb_rsp_var,
            fit_df = glbObsFit, OOB_df = glbObsOOB)
    }
}
# Interactions.High.cor.Y
# Fit a glmnet model on max_cor_y_x_vars plus the interactions of its first
# element with every feature named in glb_feats_df$cor.high.X (NA entries and
# features flagged nzv are excluded). Skipped when no such feature exists.
if (length(int_feats <- setdiff(setdiff(unique(glb_feats_df$cor.high.X), NA),
                                subset(glb_feats_df, nzv)$id)) > 0) {
    fit.models_0_chunk_df <- myadd_chunk(fit.models_0_chunk_df,
        paste0("fit.models_0_", "Interact.High.cor.Y"), major.inc = FALSE,
        label.minor = "glmnet")

    # int_feats may contain max_cor_y_x_vars[1] itself, which used to produce
    # a self-interaction term such as "ID:ID". Drop it before pairing.
    int_feats_nonself <- setdiff(int_feats, max_cor_y_x_vars[1])
    # paste() treats a zero-length vector as "", which would yield a malformed
    # "<feat>:" term; emit no interaction terms in that case.
    int_terms <- if (length(int_feats_nonself) > 0) {
        paste(max_cor_y_x_vars[1], int_feats_nonself, sep = ":")
    } else {
        character(0)
    }

    # NOTE(review): unlike the other fits in this file, no tune.df is passed
    # here -- confirm whether glbMdlTuneParams should apply to this model too.
    ret_lst <- myfit_mdl(
        mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Interact.High.cor.Y",
            type = glb_model_type,
            trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            trainControl.allowParallel = glbMdlAllowParallel,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        indepVar = c(max_cor_y_x_vars, int_terms),
        rsp_var = glb_rsp_var,
        fit_df = glbObsFit, OOB_df = glbObsOOB)
}
## label step_major step_minor label_minor
## 6 fit.models_0_Max.cor.Y.Time.Lag 1 5 glmnet
## 7 fit.models_0_Interact.High.cor.Y 1 6 glmnet
## bgn end elapsed
## 6 2016.029 2466.772 450.744
## 7 2466.773 NA NA
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "fitting model: Interact.High.cor.Y##rcv#glmnet"
## [1] " indepVar: ID,LocationDescription.my.fctr,ID:ID,ID:.pos,ID:Date.month.fctr,ID:Date.year.fctr"
## [1] "myfit_mdl: setup complete: 0.733000 secs"
## + Fold1.Rep1: alpha=0.100, lambda=0.01069
## - Fold1.Rep1: alpha=0.100, lambda=0.01069
## + Fold1.Rep1: alpha=0.325, lambda=0.01069
## - Fold1.Rep1: alpha=0.325, lambda=0.01069
## + Fold1.Rep1: alpha=0.550, lambda=0.01069
## - Fold1.Rep1: alpha=0.550, lambda=0.01069
## + Fold1.Rep1: alpha=0.775, lambda=0.01069
## - Fold1.Rep1: alpha=0.775, lambda=0.01069
## + Fold1.Rep1: alpha=1.000, lambda=0.01069
## - Fold1.Rep1: alpha=1.000, lambda=0.01069
## + Fold2.Rep1: alpha=0.100, lambda=0.01069
## - Fold2.Rep1: alpha=0.100, lambda=0.01069
## + Fold2.Rep1: alpha=0.325, lambda=0.01069
## - Fold2.Rep1: alpha=0.325, lambda=0.01069
## + Fold2.Rep1: alpha=0.550, lambda=0.01069
## - Fold2.Rep1: alpha=0.550, lambda=0.01069
## + Fold2.Rep1: alpha=0.775, lambda=0.01069
## - Fold2.Rep1: alpha=0.775, lambda=0.01069
## + Fold2.Rep1: alpha=1.000, lambda=0.01069
## - Fold2.Rep1: alpha=1.000, lambda=0.01069
## + Fold3.Rep1: alpha=0.100, lambda=0.01069
## - Fold3.Rep1: alpha=0.100, lambda=0.01069
## + Fold3.Rep1: alpha=0.325, lambda=0.01069
## - Fold3.Rep1: alpha=0.325, lambda=0.01069
## + Fold3.Rep1: alpha=0.550, lambda=0.01069
## - Fold3.Rep1: alpha=0.550, lambda=0.01069
## + Fold3.Rep1: alpha=0.775, lambda=0.01069
## - Fold3.Rep1: alpha=0.775, lambda=0.01069
## + Fold3.Rep1: alpha=1.000, lambda=0.01069
## - Fold3.Rep1: alpha=1.000, lambda=0.01069
## + Fold1.Rep2: alpha=0.100, lambda=0.01069
## - Fold1.Rep2: alpha=0.100, lambda=0.01069
## + Fold1.Rep2: alpha=0.325, lambda=0.01069
## - Fold1.Rep2: alpha=0.325, lambda=0.01069
## + Fold1.Rep2: alpha=0.550, lambda=0.01069
## - Fold1.Rep2: alpha=0.550, lambda=0.01069
## + Fold1.Rep2: alpha=0.775, lambda=0.01069
## - Fold1.Rep2: alpha=0.775, lambda=0.01069
## + Fold1.Rep2: alpha=1.000, lambda=0.01069
## - Fold1.Rep2: alpha=1.000, lambda=0.01069
## + Fold2.Rep2: alpha=0.100, lambda=0.01069
## - Fold2.Rep2: alpha=0.100, lambda=0.01069
## + Fold2.Rep2: alpha=0.325, lambda=0.01069
## - Fold2.Rep2: alpha=0.325, lambda=0.01069
## + Fold2.Rep2: alpha=0.550, lambda=0.01069
## - Fold2.Rep2: alpha=0.550, lambda=0.01069
## + Fold2.Rep2: alpha=0.775, lambda=0.01069
## - Fold2.Rep2: alpha=0.775, lambda=0.01069
## + Fold2.Rep2: alpha=1.000, lambda=0.01069
## - Fold2.Rep2: alpha=1.000, lambda=0.01069
## + Fold3.Rep2: alpha=0.100, lambda=0.01069
## - Fold3.Rep2: alpha=0.100, lambda=0.01069
## + Fold3.Rep2: alpha=0.325, lambda=0.01069
## - Fold3.Rep2: alpha=0.325, lambda=0.01069
## + Fold3.Rep2: alpha=0.550, lambda=0.01069
## - Fold3.Rep2: alpha=0.550, lambda=0.01069
## + Fold3.Rep2: alpha=0.775, lambda=0.01069
## - Fold3.Rep2: alpha=0.775, lambda=0.01069
## + Fold3.Rep2: alpha=1.000, lambda=0.01069
## - Fold3.Rep2: alpha=1.000, lambda=0.01069
## + Fold1.Rep3: alpha=0.100, lambda=0.01069
## - Fold1.Rep3: alpha=0.100, lambda=0.01069
## + Fold1.Rep3: alpha=0.325, lambda=0.01069
## - Fold1.Rep3: alpha=0.325, lambda=0.01069
## + Fold1.Rep3: alpha=0.550, lambda=0.01069
## - Fold1.Rep3: alpha=0.550, lambda=0.01069
## + Fold1.Rep3: alpha=0.775, lambda=0.01069
## - Fold1.Rep3: alpha=0.775, lambda=0.01069
## + Fold1.Rep3: alpha=1.000, lambda=0.01069
## - Fold1.Rep3: alpha=1.000, lambda=0.01069
## + Fold2.Rep3: alpha=0.100, lambda=0.01069
## - Fold2.Rep3: alpha=0.100, lambda=0.01069
## + Fold2.Rep3: alpha=0.325, lambda=0.01069
## - Fold2.Rep3: alpha=0.325, lambda=0.01069
## + Fold2.Rep3: alpha=0.550, lambda=0.01069
## - Fold2.Rep3: alpha=0.550, lambda=0.01069
## + Fold2.Rep3: alpha=0.775, lambda=0.01069
## - Fold2.Rep3: alpha=0.775, lambda=0.01069
## + Fold2.Rep3: alpha=1.000, lambda=0.01069
## - Fold2.Rep3: alpha=1.000, lambda=0.01069
## + Fold3.Rep3: alpha=0.100, lambda=0.01069
## - Fold3.Rep3: alpha=0.100, lambda=0.01069
## + Fold3.Rep3: alpha=0.325, lambda=0.01069
## - Fold3.Rep3: alpha=0.325, lambda=0.01069
## + Fold3.Rep3: alpha=0.550, lambda=0.01069
## - Fold3.Rep3: alpha=0.550, lambda=0.01069
## + Fold3.Rep3: alpha=0.775, lambda=0.01069
## - Fold3.Rep3: alpha=0.775, lambda=0.01069
## + Fold3.Rep3: alpha=1.000, lambda=0.01069
## - Fold3.Rep3: alpha=1.000, lambda=0.01069
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0107 on full training set
## [1] "myfit_mdl: train complete: 332.632000 secs"
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Interact.High.cor.Y", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Interact.High.cor.Y", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 54 -none- numeric
## beta 1998 dgCMatrix S4
## df 54 -none- numeric
## dim 2 -none- numeric
## lambda 54 -none- numeric
## dev.ratio 54 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 37 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -2.311916e+00
## ID
## -8.734485e-08
## LocationDescription.my.fctrALLEY
## 1.960841e-01
## LocationDescription.my.fctrCommercialVehicle
## 8.563410e-01
## LocationDescription.my.fctrEntertainment
## 3.103673e-01
## LocationDescription.my.fctrGAS STATION
## 9.767587e-01
## LocationDescription.my.fctrGovernment
## 5.861516e-01
## LocationDescription.my.fctrOther
## 2.784585e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.549596e-01
## LocationDescription.my.fctrResidence
## 1.661066e-01
## LocationDescription.my.fctrSchool
## 5.467738e-01
## ID:.pos
## 7.025426e-13
## ID:Date.month.fctr05
## -1.305685e-08
## ID:Date.month.fctr06
## -1.334837e-09
## ID:Date.month.fctr09
## -1.041234e-09
## ID:Date.year.fctr2002
## 3.555876e-08
## ID:Date.year.fctr2003
## 1.856689e-08
## ID:Date.year.fctr2004
## 2.787150e-09
## ID:Date.year.fctr2006
## -3.091399e-09
## ID:Date.year.fctr2007
## 8.039766e-10
## ID:Date.year.fctr2010
## -2.513250e-08
## ID:Date.year.fctr2011
## -2.218880e-08
## ID:Date.year.fctr2012
## -1.026346e-08
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -2.311762e+00
## ID
## -8.809322e-08
## LocationDescription.my.fctrALLEY
## 2.091805e-01
## LocationDescription.my.fctrCommercialVehicle
## 8.755954e-01
## LocationDescription.my.fctrEntertainment
## 3.336056e-01
## LocationDescription.my.fctrGAS STATION
## 9.900601e-01
## LocationDescription.my.fctrGovernment
## 6.118765e-01
## LocationDescription.my.fctrOther
## 2.882400e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.620808e-01
## LocationDescription.my.fctrResidence
## 1.776704e-01
## LocationDescription.my.fctrSchool
## 5.696339e-01
## ID:.pos
## 7.080145e-13
## ID:Date.month.fctr05
## -1.411892e-08
## ID:Date.month.fctr06
## -2.176373e-09
## ID:Date.month.fctr09
## -1.890424e-09
## ID:Date.month.fctr10
## -3.795720e-10
## ID:Date.year.fctr2002
## 3.627929e-08
## ID:Date.year.fctr2003
## 1.929884e-08
## ID:Date.year.fctr2004
## 3.477636e-09
## ID:Date.year.fctr2006
## -3.891133e-09
## ID:Date.year.fctr2007
## 1.746309e-09
## ID:Date.year.fctr2010
## -2.585715e-08
## ID:Date.year.fctr2011
## -2.282977e-08
## ID:Date.year.fctr2012
## -1.067269e-08
## [1] "myfit_mdl: train diagnostics complete: 333.251000 secs"
## Prediction
## Reference F T
## F 146720 29373
## T 11378 4158
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.78734430 0.06598801 0.78550529 0.78917460 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## Prediction
## Reference F T
## F 0 176093
## T 0 15536
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.08107332 0.00000000 0.07985499 0.08230435 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "myfit_mdl: predict complete: 560.069000 secs"
## id
## 1 Interact.High.cor.Y##rcv#glmnet
## feats
## 1 ID,LocationDescription.my.fctr,ID:ID,ID:.pos,ID:Date.month.fctr,ID:Date.year.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 331.147 8.347
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.6134441
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.1694825 0.9189267
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7855053 0.7891746 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.5522346
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.1499867 0.08107332
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.07985499 0.08230435 0
## max.AccuracySD.fit max.KappaSD.fit
## 1 7.220992e-06 0
## [1] "myfit_mdl: exit: 560.083000 secs"
# Low.cor.X
# Register the "fit.models_0_Low.cor.X" step (minor label "glmnet") in the
# chunk-timing table before the model is fitted.
fit.models_0_chunk_df <-
    myadd_chunk(fit.models_0_chunk_df,
                "fit.models_0_Low.cor.X",
                major.inc = FALSE,
                label.minor = "glmnet")
## label step_major step_minor label_minor
## 7 fit.models_0_Interact.High.cor.Y 1 6 glmnet
## 8 fit.models_0_Low.cor.X 1 7 glmnet
## bgn end elapsed
## 7 2466.773 3034.139 567.366
## 8 3034.140 NA NA
# Fit the "Low.cor.X" glmnet model with repeated cross-validation, using the
# predictors returned by mygetIndepVar(glb_feats_df).
indepVar <- mygetIndepVar(glb_feats_df)
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
        id.prefix = "Low.cor.X",
        type = glb_model_type,
        tune.df = glbMdlTuneParams,
        trainControl.method = "repeatedcv",
        trainControl.number = glb_rcv_n_folds,
        trainControl.repeats = glb_rcv_n_repeats,
        trainControl.classProbs = glb_is_classification,
        trainControl.summaryFunction = glbMdlMetricSummaryFn,
        trainControl.allowParallel = glbMdlAllowParallel,
        train.metric = glbMdlMetricSummary,
        train.maximize = glbMdlMetricMaximize,
        train.method = "glmnet")),
    indepVar = indepVar,
    rsp_var = glb_rsp_var,
    fit_df = glbObsFit,
    OOB_df = glbObsOOB)
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "fitting model: Low.cor.X##rcv#glmnet"
## [1] " indepVar: .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID"
## [1] "myfit_mdl: setup complete: 0.733000 secs"
## + Fold1.Rep1: alpha=0.100, lambda=0.01069
## - Fold1.Rep1: alpha=0.100, lambda=0.01069
## + Fold1.Rep1: alpha=0.325, lambda=0.01069
## - Fold1.Rep1: alpha=0.325, lambda=0.01069
## + Fold1.Rep1: alpha=0.550, lambda=0.01069
## - Fold1.Rep1: alpha=0.550, lambda=0.01069
## + Fold1.Rep1: alpha=0.775, lambda=0.01069
## - Fold1.Rep1: alpha=0.775, lambda=0.01069
## + Fold1.Rep1: alpha=1.000, lambda=0.01069
## - Fold1.Rep1: alpha=1.000, lambda=0.01069
## + Fold2.Rep1: alpha=0.100, lambda=0.01069
## - Fold2.Rep1: alpha=0.100, lambda=0.01069
## + Fold2.Rep1: alpha=0.325, lambda=0.01069
## - Fold2.Rep1: alpha=0.325, lambda=0.01069
## + Fold2.Rep1: alpha=0.550, lambda=0.01069
## - Fold2.Rep1: alpha=0.550, lambda=0.01069
## + Fold2.Rep1: alpha=0.775, lambda=0.01069
## - Fold2.Rep1: alpha=0.775, lambda=0.01069
## + Fold2.Rep1: alpha=1.000, lambda=0.01069
## - Fold2.Rep1: alpha=1.000, lambda=0.01069
## + Fold3.Rep1: alpha=0.100, lambda=0.01069
## - Fold3.Rep1: alpha=0.100, lambda=0.01069
## + Fold3.Rep1: alpha=0.325, lambda=0.01069
## - Fold3.Rep1: alpha=0.325, lambda=0.01069
## + Fold3.Rep1: alpha=0.550, lambda=0.01069
## - Fold3.Rep1: alpha=0.550, lambda=0.01069
## + Fold3.Rep1: alpha=0.775, lambda=0.01069
## - Fold3.Rep1: alpha=0.775, lambda=0.01069
## + Fold3.Rep1: alpha=1.000, lambda=0.01069
## - Fold3.Rep1: alpha=1.000, lambda=0.01069
## + Fold1.Rep2: alpha=0.100, lambda=0.01069
## - Fold1.Rep2: alpha=0.100, lambda=0.01069
## + Fold1.Rep2: alpha=0.325, lambda=0.01069
## - Fold1.Rep2: alpha=0.325, lambda=0.01069
## + Fold1.Rep2: alpha=0.550, lambda=0.01069
## - Fold1.Rep2: alpha=0.550, lambda=0.01069
## + Fold1.Rep2: alpha=0.775, lambda=0.01069
## - Fold1.Rep2: alpha=0.775, lambda=0.01069
## + Fold1.Rep2: alpha=1.000, lambda=0.01069
## - Fold1.Rep2: alpha=1.000, lambda=0.01069
## + Fold2.Rep2: alpha=0.100, lambda=0.01069
## - Fold2.Rep2: alpha=0.100, lambda=0.01069
## + Fold2.Rep2: alpha=0.325, lambda=0.01069
## - Fold2.Rep2: alpha=0.325, lambda=0.01069
## + Fold2.Rep2: alpha=0.550, lambda=0.01069
## - Fold2.Rep2: alpha=0.550, lambda=0.01069
## + Fold2.Rep2: alpha=0.775, lambda=0.01069
## - Fold2.Rep2: alpha=0.775, lambda=0.01069
## + Fold2.Rep2: alpha=1.000, lambda=0.01069
## - Fold2.Rep2: alpha=1.000, lambda=0.01069
## + Fold3.Rep2: alpha=0.100, lambda=0.01069
## - Fold3.Rep2: alpha=0.100, lambda=0.01069
## + Fold3.Rep2: alpha=0.325, lambda=0.01069
## - Fold3.Rep2: alpha=0.325, lambda=0.01069
## + Fold3.Rep2: alpha=0.550, lambda=0.01069
## - Fold3.Rep2: alpha=0.550, lambda=0.01069
## + Fold3.Rep2: alpha=0.775, lambda=0.01069
## - Fold3.Rep2: alpha=0.775, lambda=0.01069
## + Fold3.Rep2: alpha=1.000, lambda=0.01069
## - Fold3.Rep2: alpha=1.000, lambda=0.01069
## + Fold1.Rep3: alpha=0.100, lambda=0.01069
## - Fold1.Rep3: alpha=0.100, lambda=0.01069
## + Fold1.Rep3: alpha=0.325, lambda=0.01069
## - Fold1.Rep3: alpha=0.325, lambda=0.01069
## + Fold1.Rep3: alpha=0.550, lambda=0.01069
## - Fold1.Rep3: alpha=0.550, lambda=0.01069
## + Fold1.Rep3: alpha=0.775, lambda=0.01069
## - Fold1.Rep3: alpha=0.775, lambda=0.01069
## + Fold1.Rep3: alpha=1.000, lambda=0.01069
## - Fold1.Rep3: alpha=1.000, lambda=0.01069
## + Fold2.Rep3: alpha=0.100, lambda=0.01069
## - Fold2.Rep3: alpha=0.100, lambda=0.01069
## + Fold2.Rep3: alpha=0.325, lambda=0.01069
## - Fold2.Rep3: alpha=0.325, lambda=0.01069
## + Fold2.Rep3: alpha=0.550, lambda=0.01069
## - Fold2.Rep3: alpha=0.550, lambda=0.01069
## + Fold2.Rep3: alpha=0.775, lambda=0.01069
## - Fold2.Rep3: alpha=0.775, lambda=0.01069
## + Fold2.Rep3: alpha=1.000, lambda=0.01069
## - Fold2.Rep3: alpha=1.000, lambda=0.01069
## + Fold3.Rep3: alpha=0.100, lambda=0.01069
## - Fold3.Rep3: alpha=0.100, lambda=0.01069
## + Fold3.Rep3: alpha=0.325, lambda=0.01069
## - Fold3.Rep3: alpha=0.325, lambda=0.01069
## + Fold3.Rep3: alpha=0.550, lambda=0.01069
## - Fold3.Rep3: alpha=0.550, lambda=0.01069
## + Fold3.Rep3: alpha=0.775, lambda=0.01069
## - Fold3.Rep3: alpha=0.775, lambda=0.01069
## + Fold3.Rep3: alpha=1.000, lambda=0.01069
## - Fold3.Rep3: alpha=1.000, lambda=0.01069
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0107 on full training set
## [1] "myfit_mdl: train complete: 619.375000 secs"
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Low.cor.X", : model's bestTune found at an extreme of
## tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Low.cor.X", : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 59 -none- numeric
## beta 4189 dgCMatrix S4
## df 59 -none- numeric
## dim 2 -none- numeric
## lambda 59 -none- numeric
## dev.ratio 59 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 71 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## 3.457069e+01
## .pos
## 9.288549e-07
## .pos.y
## 9.483449e-07
## .rnorm
## -3.174615e-03
## Date.date.fctr(13,19]
## -1.357861e-02
## Date.day.minutes.poly.1
## 2.834088e+00
## Date.day.minutes.poly.2
## -2.985070e+01
## Date.day.minutes.poly.3
## -8.434349e+00
## Date.day.minutes.poly.5
## 6.293047e+00
## Date.last2.log1p
## 3.223433e-03
## Date.last32.log1p
## 6.644769e-02
## Date.minute.fctr(14.8,29.5]
## 2.985276e-01
## Date.minute.fctr(29.5,44.2]
## 5.300126e-02
## Date.minute.fctr(44.2,59.1]
## 3.123613e-01
## Date.month.fctr02
## 3.800956e-02
## Date.month.fctr05
## -3.011597e-02
## Date.month.fctr12
## 3.049606e-02
## Date.wkday.fctr4
## -9.609387e-03
## Date.wkend
## 3.747579e-02
## Date.year.fctr2002
## 5.772629e-02
## Date.year.fctr2003
## 6.357462e-02
## Date.year.fctr2004
## 5.326317e-02
## Date.year.fctr2005
## 3.570502e-02
## Date.year.fctr2007
## 6.819216e-02
## Date.year.fctr2010
## -1.952413e-01
## Date.year.fctr2011
## -2.339641e-01
## Date.year.fctr2012
## -2.015466e-01
## District.fctr10-19
## 1.800929e-02
## ID
## -3.441064e-08
## LocationDescription.my.fctrALLEY
## 1.611979e-01
## LocationDescription.my.fctrCommercialVehicle
## 7.837998e-01
## LocationDescription.my.fctrEntertainment
## 2.697141e-01
## LocationDescription.my.fctrGAS STATION
## 9.136771e-01
## LocationDescription.my.fctrGovernment
## 5.429015e-01
## LocationDescription.my.fctrOther
## 2.440159e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.004673e-01
## LocationDescription.my.fctrResidence
## 1.669185e-01
## LocationDescription.my.fctrSchool
## 4.610375e-01
## Year
## -1.887036e-02
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## 3.420387e+01
## .pos
## 9.190503e-07
## .pos.y
## 9.397726e-07
## .rnorm
## -4.415502e-03
## Date.date.fctr(13,19]
## -1.684284e-02
## Date.day.minutes.poly.1
## 3.635232e+00
## Date.day.minutes.poly.2
## -3.041163e+01
## Date.day.minutes.poly.3
## -9.343094e+00
## Date.day.minutes.poly.5
## 6.818306e+00
## Date.last16.log1p
## 1.566097e-04
## Date.last2.log1p
## 3.519131e-03
## Date.last32.log1p
## 6.809740e-02
## Date.minute.fctr(14.8,29.5]
## 3.059729e-01
## Date.minute.fctr(29.5,44.2]
## 5.756666e-02
## Date.minute.fctr(44.2,59.1]
## 3.197748e-01
## Date.month.fctr02
## 4.257876e-02
## Date.month.fctr05
## -3.417938e-02
## Date.month.fctr12
## 3.517850e-02
## Date.wkday.fctr4
## -1.248810e-02
## Date.wkend
## 4.013331e-02
## Date.year.fctr2002
## 6.405933e-02
## Date.year.fctr2003
## 6.875965e-02
## Date.year.fctr2004
## 5.821583e-02
## Date.year.fctr2005
## 4.065109e-02
## Date.year.fctr2007
## 7.351063e-02
## Date.year.fctr2010
## -2.042294e-01
## Date.year.fctr2011
## -2.444259e-01
## Date.year.fctr2012
## -2.127867e-01
## District.fctr10-19
## 2.173914e-02
## ID
## -3.475309e-08
## LocationDescription.my.fctrALLEY
## 1.729817e-01
## LocationDescription.my.fctrCommercialVehicle
## 8.020796e-01
## LocationDescription.my.fctrEntertainment
## 2.912019e-01
## LocationDescription.my.fctrGAS STATION
## 9.247693e-01
## LocationDescription.my.fctrGovernment
## 5.662629e-01
## LocationDescription.my.fctrOther
## 2.529958e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.063057e-01
## LocationDescription.my.fctrResidence
## 1.784438e-01
## LocationDescription.my.fctrSchool
## 4.825700e-01
## Year
## -1.869923e-02
## [1] "myfit_mdl: train diagnostics complete: 620.043000 secs"
## Prediction
## Reference F T
## F 138793 37300
## T 10188 5348
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.75218782 0.07374411 0.75024836 0.75411965 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## Prediction
## Reference F T
## F 73925 102168
## T 3797 11739
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.44703046 0.04513121 0.44480261 0.44925992 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "myfit_mdl: predict complete: 861.877000 secs"
## id
## 1 Low.cor.X##rcv#glmnet
## feats
## 1 .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 615.615 16.334
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.6243901
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.1838306 0.9189267
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7502484 0.7541196 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.6249179
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.1813771 0.4470305
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.4448026 0.4492599 0.04513121
## max.AccuracySD.fit max.KappaSD.fit
## 1 7.220992e-06 0
## [1] "myfit_mdl: exit: 861.892000 secs"
# Close out the fit.models_0 stage: add the final "teardown" row to its
# chunk-timing table.
fit.models_0_chunk_df <- myadd_chunk(
    fit.models_0_chunk_df,
    "fit.models_0_end",
    major.inc = FALSE,
    label.minor = "teardown")
## label step_major step_minor label_minor bgn
## 8 fit.models_0_Low.cor.X 1 7 glmnet 3034.140
## 9 fit.models_0_end 1 8 teardown 3901.059
## end elapsed
## 8 3901.059 866.919
## 9 NA NA
# The last fit result is no longer needed; drop it from the workspace.
rm(list = "ret_lst")
# Start the next minor step of the "fit.models" stage in the global chunk table.
glb_chunks_df <- myadd_chunk(
    glb_chunks_df,
    "fit.models",
    major.inc = FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 16 fit.models 8 0 0 552.914 3901.072 3348.159
## 17 fit.models 8 1 1 3901.073 NA NA
# Start a fresh chunk-timing table for the fit.models_1 stage (no prior table
# is passed in), with "setup" as its first minor step.
fit.models_1_chunk_df <-
    myadd_chunk(NULL,
                "fit.models_1_bgn",
                label.minor = "setup")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 setup 3978.044 NA NA
# TODO: refactor the outlier handling so that every model fitted in this chunk excludes the outliers
#stop("here"); glb2Sav(); all.equal(glb_models_df, sav_models_df)
# Fit every model family named in glbMdlFamilies with each of its methods.
#
# For each family prefix (mdl_id_pfx) the loop:
#   1. chooses the predictors (indepVar):
#        - "*.Interact" families (first one only): derived from the feature
#          importances of the best glmnet model fitted so far;
#        - otherwise glb_mdl_feats_lst[[mdl_id_pfx]], then the RFE predictors
#          for "RFE.*" prefixes, then mygetIndepVar(glb_feats_df);
#   2. removes the observations listed in glbObsFitOutliers[[mdl_id_pfx]];
#   3. calls myfit_mdl() once per method with repeated cross-validation.
topindep_var <- NULL; interact_vars <- NULL;
for (mdl_id_pfx in names(glbMdlFamilies)) {
    fit.models_1_chunk_df <-
        myadd_chunk(fit.models_1_chunk_df, paste0("fit.models_1_", mdl_id_pfx),
                    major.inc = FALSE, label.minor = "setup")

    indepVar <- NULL;

    if (grepl("\\.Interact", mdl_id_pfx)) {
        if (is.null(topindep_var) && is.null(interact_vars)) {
            # select best glmnet model up to now
            dsp_models_df <- orderBy(model_sel_frmla <- get_model_sel_frmla(),
                                     glb_models_df)
            # Model ids logged by myfit_mdl look like "Low.cor.X##rcv#glmnet",
            # which a literal ".glmnet" never matches; accept both separators.
            dsp_models_df <- subset(dsp_models_df,
                                    grepl("[.#]glmnet", id))
            if (nrow(dsp_models_df) == 0) {
                warning("No glmnet model available as base for: ", mdl_id_pfx)
                next
            }
            bst_mdl_id <- dsp_models_df$id[1]
            # NOTE(review): this drops only the last "."-separated piece of
            # the id; with "##rcv#<method>" ids that piece also carries part
            # of the original prefix -- confirm the intended family name.
            mdl_id_pfx <-
                paste(c(head(unlist(strsplit(bst_mdl_id, "[.]")), -1), "Interact"),
                      collapse = ".")

            # select important features
            if (is.null(bst_featsimp_df <-
                        myget_feats_importance(glb_models_lst[[bst_mdl_id]]))) {
                warning("Base model for RFE.Interact: ", bst_mdl_id,
                        " has no important features")
                next
            }

            # Top feature: first row (presumably the most important one) that
            # is not interaction-only. Factor dummies are mapped back to their
            # "<name>.fctr" feature.
            topindep_ix <- 1
            while (is.null(topindep_var) && (topindep_ix <= nrow(bst_featsimp_df))) {
                topindep_var <- row.names(bst_featsimp_df)[topindep_ix]
                if (grepl(".fctr", topindep_var, fixed = TRUE))
                    # fixed = TRUE so "." is literal, matching the grepl above
                    topindep_var <-
                        paste0(unlist(strsplit(topindep_var, ".fctr",
                                               fixed = TRUE))[1], ".fctr")
                if (topindep_var %in% names(glbFeatsInteractionOnly)) {
                    topindep_var <- NULL; topindep_ix <- topindep_ix + 1
                } else break
            }
            if (is.null(topindep_var)) {
                # Every candidate was interaction-only (or there were none);
                # the grepl()/paste() calls below cannot work without it.
                warning("Base model for RFE.Interact: ", bst_mdl_id,
                        " has no usable top feature")
                next
            }

            # select features with importance > max(10, importance of .rnorm) & is not highest
            # combine factor dummy features to just the factor feature
            if (length(pos_rnorm <-
                       grep(".rnorm", row.names(bst_featsimp_df), fixed = TRUE)) > 0)
                imp_rnorm <- bst_featsimp_df[pos_rnorm, 1] else
                imp_rnorm <- NA
            imp_cutoff <- max(10, imp_rnorm, na.rm = TRUE)
            interact_vars <-
                tail(row.names(subset(bst_featsimp_df,
                                      imp > imp_cutoff)), -1)
            if (length(interact_vars) > 0) {
                interact_vars <-
                    myadjustInteractionFeats(glb_feats_df,
                                             myextract_actual_feats(interact_vars))
                interact_vars <-
                    interact_vars[!grepl(topindep_var, interact_vars, fixed = TRUE)]
            }
            ### bid0_sp only
#             interact_vars <- c(
#     "biddable", "D.ratio.sum.TfIdf.wrds.n", "D.TfIdf.sum.stem.stop.Ratio", "D.sum.TfIdf",
#     "D.TfIdf.sum.post.stop", "D.TfIdf.sum.post.stem", "D.ratio.wrds.stop.n.wrds.n", "D.chrs.uppr.n.log",
#     "D.chrs.n.log", "color.fctr"
#     # , "condition.fctr", "prdl.my.descr.fctr"
#                                 )
#            interact_vars <- setdiff(interact_vars, c("startprice.dgt2.is9", "color.fctr"))
            ###
            indepVar <- myextract_actual_feats(row.names(bst_featsimp_df))
            indepVar <- setdiff(indepVar, topindep_var)
            if (length(interact_vars) > 0) {
                indepVar <-
                    setdiff(indepVar, myextract_actual_feats(interact_vars))
                indepVar <- c(indepVar,
                              paste(topindep_var, setdiff(interact_vars, topindep_var),
                                    sep = "*"))
            } else indepVar <- union(indepVar, topindep_var)
        }
    }

    # Fallback chain for families whose predictors were not set above.
    if (is.null(indepVar))
        indepVar <- glb_mdl_feats_lst[[mdl_id_pfx]]

    if (is.null(indepVar) && grepl("RFE\\.", mdl_id_pfx))
        indepVar <- myextract_actual_feats(predictors(rfe_fit_results))

    if (is.null(indepVar))
        indepVar <- mygetIndepVar(glb_feats_df)

    # A single entry starting with "%<d-%" holds R code producing the feature
    # vector; the code after the marker is evaluated.
    # NOTE(review): eval(parse()) runs arbitrary code from the configuration.
    if ((length(indepVar) == 1) && (grepl("^%<d-%", indepVar))) {
        indepVar <-
            eval(parse(text = str_trim(unlist(strsplit(indepVar, "%<d-%"))[2])))
    }

    indepVar <- myadjustInteractionFeats(glb_feats_df, indepVar)

    if (grepl("\\.Interact", mdl_id_pfx)) {
        # if (method != tail(unlist(strsplit(bst_mdl_id, "[.]")), 1)) next
        # A derived Interact prefix has no family entry of its own: inherit
        # the methods configured under "Best.Interact" when present.
        if (is.null(glbMdlFamilies[[mdl_id_pfx]])) {
            if (!is.null(glbMdlFamilies[["Best.Interact"]]))
                glbMdlFamilies[[mdl_id_pfx]] <-
                    glbMdlFamilies[["Best.Interact"]]
        }
    }

    if (!is.null(glbObsFitOutliers[[mdl_id_pfx]])) {
        fitobs_df <- glbObsFit[!(glbObsFit[, glbFeatsId] %in%
                                 glbObsFitOutliers[[mdl_id_pfx]]), ]
        print(sprintf("Outliers removed: %d", nrow(glbObsFit) - nrow(fitobs_df)))
        print(setdiff(glbObsFit[, glbFeatsId], fitobs_df[, glbFeatsId]))
    } else fitobs_df <- glbObsFit

    if (is.null(glbMdlFamilies[[mdl_id_pfx]]))
        mdl_methods <- glbMdlMethods else
        mdl_methods <- glbMdlFamilies[[mdl_id_pfx]]

    for (method in mdl_methods) {
        # Per-method copy: stripping .rnorm for rpart/rf must not leak into
        # the methods fitted after them in the same family.
        mthd_indepVar <- indepVar
        if (method %in% c("rpart", "rf")) {
            # rpart: fubar's the tree
            # rf: skip the scenario w/ .rnorm for speed
            mthd_indepVar <- setdiff(indepVar, c(".rnorm"))
            #mdl_id <- paste0(mdl_id_pfx, ".no.rnorm")
        }

        fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df,
            paste0("fit.models_1_", mdl_id_pfx), major.inc = FALSE,
            label.minor = method)

        ret_lst <-
            myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
                id.prefix = mdl_id_pfx,
                type = glb_model_type,
                tune.df = glbMdlTuneParams,
                trainControl.method = "repeatedcv", # or "none" if nominalWorkflow is crashing
                trainControl.number = glb_rcv_n_folds,
                trainControl.repeats = glb_rcv_n_repeats,
                trainControl.classProbs = glb_is_classification,
                trainControl.summaryFunction = glbMdlMetricSummaryFn,
                trainControl.allowParallel = glbMdlAllowParallel,
                train.metric = glbMdlMetricSummary,
                train.maximize = glbMdlMetricMaximize,
                train.method = method)),
                indepVar = mthd_indepVar, rsp_var = glb_rsp_var,
                fit_df = fitobs_df, OOB_df = glbObsOOB)

#         ntv_mdl <- glmnet(x = as.matrix(
#                               fitobs_df[, indepVar]),
#                           y = as.factor(as.character(
#                               fitobs_df[, glb_rsp_var])),
#                           family = "multinomial")
#         bgn = 1; end = 100;
#         ntv_mdl <- glmnet(x = as.matrix(
#                               subset(fitobs_df, pop.fctr != "crypto")[bgn:end, indepVar]),
#                           y = as.factor(as.character(
#                               subset(fitobs_df, pop.fctr != "crypto")[bgn:end, glb_rsp_var])),
#                           family = "multinomial")
    }
}
## label step_major step_minor label_minor bgn end
## 1 fit.models_1_bgn 1 0 setup 3978.044 3978.056
## 2 fit.models_1_All.X 1 1 setup 3978.057 NA
## elapsed
## 1 0.012
## 2 NA
## label step_major step_minor label_minor bgn end
## 2 fit.models_1_All.X 1 1 setup 3978.057 3978.068
## 3 fit.models_1_All.X 1 2 glmnet 3978.069 NA
## elapsed
## 2 0.012
## 3 NA
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "fitting model: All.X##rcv#glmnet"
## [1] " indepVar: .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID"
## [1] "myfit_mdl: setup complete: 0.957000 secs"
## + Fold1.Rep1: alpha=0.100, lambda=0.01069
## - Fold1.Rep1: alpha=0.100, lambda=0.01069
## + Fold1.Rep1: alpha=0.325, lambda=0.01069
## - Fold1.Rep1: alpha=0.325, lambda=0.01069
## + Fold1.Rep1: alpha=0.550, lambda=0.01069
## - Fold1.Rep1: alpha=0.550, lambda=0.01069
## + Fold1.Rep1: alpha=0.775, lambda=0.01069
## - Fold1.Rep1: alpha=0.775, lambda=0.01069
## + Fold1.Rep1: alpha=1.000, lambda=0.01069
## - Fold1.Rep1: alpha=1.000, lambda=0.01069
## + Fold2.Rep1: alpha=0.100, lambda=0.01069
## - Fold2.Rep1: alpha=0.100, lambda=0.01069
## + Fold2.Rep1: alpha=0.325, lambda=0.01069
## - Fold2.Rep1: alpha=0.325, lambda=0.01069
## + Fold2.Rep1: alpha=0.550, lambda=0.01069
## - Fold2.Rep1: alpha=0.550, lambda=0.01069
## + Fold2.Rep1: alpha=0.775, lambda=0.01069
## - Fold2.Rep1: alpha=0.775, lambda=0.01069
## + Fold2.Rep1: alpha=1.000, lambda=0.01069
## - Fold2.Rep1: alpha=1.000, lambda=0.01069
## + Fold3.Rep1: alpha=0.100, lambda=0.01069
## - Fold3.Rep1: alpha=0.100, lambda=0.01069
## + Fold3.Rep1: alpha=0.325, lambda=0.01069
## - Fold3.Rep1: alpha=0.325, lambda=0.01069
## + Fold3.Rep1: alpha=0.550, lambda=0.01069
## - Fold3.Rep1: alpha=0.550, lambda=0.01069
## + Fold3.Rep1: alpha=0.775, lambda=0.01069
## - Fold3.Rep1: alpha=0.775, lambda=0.01069
## + Fold3.Rep1: alpha=1.000, lambda=0.01069
## - Fold3.Rep1: alpha=1.000, lambda=0.01069
## + Fold1.Rep2: alpha=0.100, lambda=0.01069
## - Fold1.Rep2: alpha=0.100, lambda=0.01069
## + Fold1.Rep2: alpha=0.325, lambda=0.01069
## - Fold1.Rep2: alpha=0.325, lambda=0.01069
## + Fold1.Rep2: alpha=0.550, lambda=0.01069
## - Fold1.Rep2: alpha=0.550, lambda=0.01069
## + Fold1.Rep2: alpha=0.775, lambda=0.01069
## - Fold1.Rep2: alpha=0.775, lambda=0.01069
## + Fold1.Rep2: alpha=1.000, lambda=0.01069
## - Fold1.Rep2: alpha=1.000, lambda=0.01069
## + Fold2.Rep2: alpha=0.100, lambda=0.01069
## - Fold2.Rep2: alpha=0.100, lambda=0.01069
## + Fold2.Rep2: alpha=0.325, lambda=0.01069
## - Fold2.Rep2: alpha=0.325, lambda=0.01069
## + Fold2.Rep2: alpha=0.550, lambda=0.01069
## - Fold2.Rep2: alpha=0.550, lambda=0.01069
## + Fold2.Rep2: alpha=0.775, lambda=0.01069
## - Fold2.Rep2: alpha=0.775, lambda=0.01069
## + Fold2.Rep2: alpha=1.000, lambda=0.01069
## - Fold2.Rep2: alpha=1.000, lambda=0.01069
## + Fold3.Rep2: alpha=0.100, lambda=0.01069
## - Fold3.Rep2: alpha=0.100, lambda=0.01069
## + Fold3.Rep2: alpha=0.325, lambda=0.01069
## - Fold3.Rep2: alpha=0.325, lambda=0.01069
## + Fold3.Rep2: alpha=0.550, lambda=0.01069
## - Fold3.Rep2: alpha=0.550, lambda=0.01069
## + Fold3.Rep2: alpha=0.775, lambda=0.01069
## - Fold3.Rep2: alpha=0.775, lambda=0.01069
## + Fold3.Rep2: alpha=1.000, lambda=0.01069
## - Fold3.Rep2: alpha=1.000, lambda=0.01069
## + Fold1.Rep3: alpha=0.100, lambda=0.01069
## - Fold1.Rep3: alpha=0.100, lambda=0.01069
## + Fold1.Rep3: alpha=0.325, lambda=0.01069
## - Fold1.Rep3: alpha=0.325, lambda=0.01069
## + Fold1.Rep3: alpha=0.550, lambda=0.01069
## - Fold1.Rep3: alpha=0.550, lambda=0.01069
## + Fold1.Rep3: alpha=0.775, lambda=0.01069
## - Fold1.Rep3: alpha=0.775, lambda=0.01069
## + Fold1.Rep3: alpha=1.000, lambda=0.01069
## - Fold1.Rep3: alpha=1.000, lambda=0.01069
## + Fold2.Rep3: alpha=0.100, lambda=0.01069
## - Fold2.Rep3: alpha=0.100, lambda=0.01069
## + Fold2.Rep3: alpha=0.325, lambda=0.01069
## - Fold2.Rep3: alpha=0.325, lambda=0.01069
## + Fold2.Rep3: alpha=0.550, lambda=0.01069
## - Fold2.Rep3: alpha=0.550, lambda=0.01069
## + Fold2.Rep3: alpha=0.775, lambda=0.01069
## - Fold2.Rep3: alpha=0.775, lambda=0.01069
## + Fold2.Rep3: alpha=1.000, lambda=0.01069
## - Fold2.Rep3: alpha=1.000, lambda=0.01069
## + Fold3.Rep3: alpha=0.100, lambda=0.01069
## - Fold3.Rep3: alpha=0.100, lambda=0.01069
## + Fold3.Rep3: alpha=0.325, lambda=0.01069
## - Fold3.Rep3: alpha=0.325, lambda=0.01069
## + Fold3.Rep3: alpha=0.550, lambda=0.01069
## - Fold3.Rep3: alpha=0.550, lambda=0.01069
## + Fold3.Rep3: alpha=0.775, lambda=0.01069
## - Fold3.Rep3: alpha=0.775, lambda=0.01069
## + Fold3.Rep3: alpha=1.000, lambda=0.01069
## - Fold3.Rep3: alpha=1.000, lambda=0.01069
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0107 on full training set
## [1] "myfit_mdl: train complete: 606.983000 secs"
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 59 -none- numeric
## beta 4189 dgCMatrix S4
## df 59 -none- numeric
## dim 2 -none- numeric
## lambda 59 -none- numeric
## dev.ratio 59 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 71 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## 3.457069e+01
## .pos
## 9.288549e-07
## .pos.y
## 9.483449e-07
## .rnorm
## -3.174615e-03
## Date.date.fctr(13,19]
## -1.357861e-02
## Date.day.minutes.poly.1
## 2.834088e+00
## Date.day.minutes.poly.2
## -2.985070e+01
## Date.day.minutes.poly.3
## -8.434349e+00
## Date.day.minutes.poly.5
## 6.293047e+00
## Date.last2.log1p
## 3.223433e-03
## Date.last32.log1p
## 6.644769e-02
## Date.minute.fctr(14.8,29.5]
## 2.985276e-01
## Date.minute.fctr(29.5,44.2]
## 5.300126e-02
## Date.minute.fctr(44.2,59.1]
## 3.123613e-01
## Date.month.fctr02
## 3.800956e-02
## Date.month.fctr05
## -3.011597e-02
## Date.month.fctr12
## 3.049606e-02
## Date.wkday.fctr4
## -9.609387e-03
## Date.wkend
## 3.747579e-02
## Date.year.fctr2002
## 5.772629e-02
## Date.year.fctr2003
## 6.357462e-02
## Date.year.fctr2004
## 5.326317e-02
## Date.year.fctr2005
## 3.570502e-02
## Date.year.fctr2007
## 6.819216e-02
## Date.year.fctr2010
## -1.952413e-01
## Date.year.fctr2011
## -2.339641e-01
## Date.year.fctr2012
## -2.015466e-01
## District.fctr10-19
## 1.800929e-02
## ID
## -3.441064e-08
## LocationDescription.my.fctrALLEY
## 1.611979e-01
## LocationDescription.my.fctrCommercialVehicle
## 7.837998e-01
## LocationDescription.my.fctrEntertainment
## 2.697141e-01
## LocationDescription.my.fctrGAS STATION
## 9.136771e-01
## LocationDescription.my.fctrGovernment
## 5.429015e-01
## LocationDescription.my.fctrOther
## 2.440159e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.004673e-01
## LocationDescription.my.fctrResidence
## 1.669185e-01
## LocationDescription.my.fctrSchool
## 4.610375e-01
## Year
## -1.887036e-02
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## 3.420387e+01
## .pos
## 9.190503e-07
## .pos.y
## 9.397726e-07
## .rnorm
## -4.415502e-03
## Date.date.fctr(13,19]
## -1.684284e-02
## Date.day.minutes.poly.1
## 3.635232e+00
## Date.day.minutes.poly.2
## -3.041163e+01
## Date.day.minutes.poly.3
## -9.343094e+00
## Date.day.minutes.poly.5
## 6.818306e+00
## Date.last16.log1p
## 1.566097e-04
## Date.last2.log1p
## 3.519131e-03
## Date.last32.log1p
## 6.809740e-02
## Date.minute.fctr(14.8,29.5]
## 3.059729e-01
## Date.minute.fctr(29.5,44.2]
## 5.756666e-02
## Date.minute.fctr(44.2,59.1]
## 3.197748e-01
## Date.month.fctr02
## 4.257876e-02
## Date.month.fctr05
## -3.417938e-02
## Date.month.fctr12
## 3.517850e-02
## Date.wkday.fctr4
## -1.248810e-02
## Date.wkend
## 4.013331e-02
## Date.year.fctr2002
## 6.405933e-02
## Date.year.fctr2003
## 6.875965e-02
## Date.year.fctr2004
## 5.821583e-02
## Date.year.fctr2005
## 4.065109e-02
## Date.year.fctr2007
## 7.351063e-02
## Date.year.fctr2010
## -2.042294e-01
## Date.year.fctr2011
## -2.444259e-01
## Date.year.fctr2012
## -2.127867e-01
## District.fctr10-19
## 2.173914e-02
## ID
## -3.475309e-08
## LocationDescription.my.fctrALLEY
## 1.729817e-01
## LocationDescription.my.fctrCommercialVehicle
## 8.020796e-01
## LocationDescription.my.fctrEntertainment
## 2.912019e-01
## LocationDescription.my.fctrGAS STATION
## 9.247693e-01
## LocationDescription.my.fctrGovernment
## 5.662629e-01
## LocationDescription.my.fctrOther
## 2.529958e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.063057e-01
## LocationDescription.my.fctrResidence
## 1.784438e-01
## LocationDescription.my.fctrSchool
## 4.825700e-01
## Year
## -1.869923e-02
## [1] "myfit_mdl: train diagnostics complete: 607.925000 secs"
## Prediction
## Reference F T
## F 138793 37300
## T 10188 5348
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.75218782 0.07374411 0.75024836 0.75411965 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## Prediction
## Reference F T
## F 73925 102168
## T 3797 11739
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.44703046 0.04513121 0.44480261 0.44925992 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "myfit_mdl: predict complete: 847.820000 secs"
## id
## 1 All.X##rcv#glmnet
## feats
## 1 .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 604.994 16.301
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.6243901
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.1838306 0.9189267
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7502484 0.7541196 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.6249179
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.1813771 0.4470305
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.4448026 0.4492599 0.04513121
## max.AccuracySD.fit max.KappaSD.fit
## 1 7.220992e-06 0
## [1] "myfit_mdl: exit: 847.835000 secs"
## label step_major step_minor label_minor bgn end
## 3 fit.models_1_All.X 1 2 glmnet 3978.069 4825.91
## 4 fit.models_1_All.X 1 3 glm 4825.911 NA
## elapsed
## 3 847.841
## 4 NA
## [1] "myfit_mdl: enter: 0.000000 secs"
## [1] "fitting model: All.X##rcv#glm"
## [1] " indepVar: .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID"
## [1] "myfit_mdl: setup complete: 4.879000 secs"
## + Fold1.Rep1: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep1: parameter=none
## + Fold2.Rep1: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep1: parameter=none
## + Fold3.Rep1: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep1: parameter=none
## + Fold1.Rep2: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep2: parameter=none
## + Fold2.Rep2: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep2: parameter=none
## + Fold3.Rep2: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep2: parameter=none
## + Fold1.Rep3: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep3: parameter=none
## + Fold2.Rep3: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep3: parameter=none
## + Fold3.Rep3: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep3: parameter=none
## Aggregating results
## Fitting final model on full training set
## [1] "myfit_mdl: train complete: 160.979000 secs"
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9768 -0.4488 -0.3942 -0.3028 2.7314
##
## Coefficients: (3 not defined because of singularities)
## Estimate
## (Intercept) 5.728e+00
## .pos -3.975e-05
## .pos.y NA
## .rnorm -1.902e-02
## Beat -2.488e-05
## `Date.date.fctr(7,13]` 4.222e-03
## `Date.date.fctr(13,19]` -6.511e-02
## `Date.date.fctr(19,25]` -1.077e-02
## `Date.date.fctr(25,31]` -2.910e-02
## Date.day.minutes.poly.1 3.657e+01
## Date.day.minutes.poly.2 -4.238e+01
## Date.day.minutes.poly.3 -2.273e+01
## Date.day.minutes.poly.4 4.325e+00
## Date.day.minutes.poly.5 1.255e+01
## `Date.hour.fctr(7.67,15.3]` -7.257e-02
## `Date.hour.fctr(15.3,23]` -1.025e-01
## Date.juliandate 1.369e-03
## Date.last16.log1p 5.051e-02
## Date.last2.log1p 1.207e-02
## Date.last32.log1p 6.251e-02
## Date.last4.log1p -2.639e-02
## Date.last8.log1p -1.640e-02
## `Date.minute.fctr(14.8,29.5]` 3.766e-01
## `Date.minute.fctr(29.5,44.2]` 1.043e-01
## `Date.minute.fctr(44.2,59.1]` 3.939e-01
## Date.month.fctr02 -6.165e-02
## Date.month.fctr03 -2.084e-01
## Date.month.fctr04 -2.595e-01
## Date.month.fctr05 -3.992e-01
## Date.month.fctr06 -4.134e-01
## Date.month.fctr07 -4.457e-01
## Date.month.fctr08 -4.862e-01
## Date.month.fctr09 -5.754e-01
## Date.month.fctr10 -6.204e-01
## Date.month.fctr11 -6.707e-01
## Date.month.fctr12 -6.381e-01
## Date.wkday.fctr1 -1.128e-01
## Date.wkday.fctr2 -1.258e-01
## Date.wkday.fctr3 -9.348e-02
## Date.wkday.fctr4 -1.497e-01
## Date.wkday.fctr5 -8.658e-02
## Date.wkday.fctr6 -6.604e-02
## Date.wkend NA
## Date.year.fctr2002 -2.303e-01
## Date.year.fctr2003 -4.994e-01
## Date.year.fctr2004 -7.643e-01
## Date.year.fctr2005 -9.364e-01
## Date.year.fctr2006 -1.164e+00
## Date.year.fctr2007 -1.100e+00
## Date.year.fctr2008 -1.282e+00
## Date.year.fctr2009 -1.322e+00
## Date.year.fctr2010 -1.803e+00
## Date.year.fctr2011 -2.128e+00
## Date.year.fctr2012 -2.318e+00
## `District.fctr1-9` -4.837e-02
## `District.fctr10-19` 3.938e-02
## `District.fctr20+` 1.526e-02
## ID -8.044e-07
## LocationDescription.my.fctrALLEY 3.028e-01
## LocationDescription.my.fctrCommercialVehicle 1.030e+00
## LocationDescription.my.fctrEntertainment 5.196e-01
## `LocationDescription.my.fctrGAS STATION` 1.041e+00
## LocationDescription.my.fctrGovernment 8.092e-01
## LocationDescription.my.fctrOther 3.668e-01
## `LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)` 2.741e-01
## LocationDescription.my.fctrResidence 3.158e-01
## LocationDescription.my.fctrSchool 7.050e-01
## LocationDescription.my.fctrSidewalk 1.761e-01
## `LocationDescription.my.fctrVACANT LOT/LAND` -1.407e-01
## `LocationDescription.my.fctrVEHICLE NON-COMMERCIAL` 2.900e-01
## LocationDescription.my.fctrcha -2.665e-01
## Year NA
## Std. Error
## (Intercept) 2.264e+00
## .pos 1.197e-05
## .pos.y NA
## .rnorm 8.477e-03
## Beat 1.754e-05
## `Date.date.fctr(7,13]` 4.038e-02
## `Date.date.fctr(13,19]` 6.497e-02
## `Date.date.fctr(19,25]` 9.174e-02
## `Date.date.fctr(25,31]` 1.186e-01
## Date.day.minutes.poly.1 1.856e+01
## Date.day.minutes.poly.2 1.026e+01
## Date.day.minutes.poly.3 6.318e+00
## Date.day.minutes.poly.4 7.650e+00
## Date.day.minutes.poly.5 5.570e+00
## `Date.hour.fctr(7.67,15.3]` 5.635e-02
## `Date.hour.fctr(15.3,23]` 7.432e-02
## Date.juliandate 4.802e-03
## Date.last16.log1p 3.277e-02
## Date.last2.log1p 4.129e-03
## Date.last32.log1p 3.687e-02
## Date.last4.log1p 9.289e-03
## Date.last8.log1p 2.063e-02
## `Date.minute.fctr(14.8,29.5]` 3.308e-02
## `Date.minute.fctr(29.5,44.2]` 2.100e-02
## `Date.minute.fctr(44.2,59.1]` 3.256e-02
## Date.month.fctr02 1.524e-01
## Date.month.fctr03 2.851e-01
## Date.month.fctr04 4.313e-01
## Date.month.fctr05 5.744e-01
## Date.month.fctr06 7.213e-01
## Date.month.fctr07 8.644e-01
## Date.month.fctr08 1.012e+00
## Date.month.fctr09 1.159e+00
## Date.month.fctr10 1.303e+00
## Date.month.fctr11 1.450e+00
## Date.month.fctr12 1.593e+00
## Date.wkday.fctr1 3.152e-02
## Date.wkday.fctr2 3.177e-02
## Date.wkday.fctr3 3.142e-02
## Date.wkday.fctr4 3.182e-02
## Date.wkday.fctr5 3.093e-02
## Date.wkday.fctr6 3.119e-02
## Date.wkend NA
## Date.year.fctr2002 2.816e-01
## Date.year.fctr2003 5.315e-01
## Date.year.fctr2004 7.733e-01
## Date.year.fctr2005 1.022e+00
## Date.year.fctr2006 1.276e+00
## Date.year.fctr2007 1.516e+00
## Date.year.fctr2008 1.745e+00
## Date.year.fctr2009 1.953e+00
## Date.year.fctr2010 2.159e+00
## Date.year.fctr2011 2.390e+00
## Date.year.fctr2012 2.602e+00
## `District.fctr1-9` 5.535e-02
## `District.fctr10-19` 5.509e-02
## `District.fctr20+` 6.171e-02
## ID 1.861e-07
## LocationDescription.my.fctrALLEY 6.834e-02
## LocationDescription.my.fctrCommercialVehicle 1.014e-01
## LocationDescription.my.fctrEntertainment 1.218e-01
## `LocationDescription.my.fctrGAS STATION` 5.545e-02
## LocationDescription.my.fctrGovernment 1.509e-01
## LocationDescription.my.fctrOther 4.526e-02
## `LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)` 2.877e-02
## LocationDescription.my.fctrResidence 4.689e-02
## LocationDescription.my.fctrSchool 1.345e-01
## LocationDescription.my.fctrSidewalk 1.614e-01
## `LocationDescription.my.fctrVACANT LOT/LAND` 1.276e-01
## `LocationDescription.my.fctrVEHICLE NON-COMMERCIAL` 1.371e-01
## LocationDescription.my.fctrcha 1.936e-01
## Year NA
## z value
## (Intercept) 2.530
## .pos -3.321
## .pos.y NA
## .rnorm -2.243
## Beat -1.418
## `Date.date.fctr(7,13]` 0.105
## `Date.date.fctr(13,19]` -1.002
## `Date.date.fctr(19,25]` -0.117
## `Date.date.fctr(25,31]` -0.245
## Date.day.minutes.poly.1 1.971
## Date.day.minutes.poly.2 -4.132
## Date.day.minutes.poly.3 -3.597
## Date.day.minutes.poly.4 0.565
## Date.day.minutes.poly.5 2.253
## `Date.hour.fctr(7.67,15.3]` -1.288
## `Date.hour.fctr(15.3,23]` -1.379
## Date.juliandate 0.285
## Date.last16.log1p 1.541
## Date.last2.log1p 2.922
## Date.last32.log1p 1.695
## Date.last4.log1p -2.841
## Date.last8.log1p -0.795
## `Date.minute.fctr(14.8,29.5]` 11.386
## `Date.minute.fctr(29.5,44.2]` 4.967
## `Date.minute.fctr(44.2,59.1]` 12.097
## Date.month.fctr02 -0.404
## Date.month.fctr03 -0.731
## Date.month.fctr04 -0.602
## Date.month.fctr05 -0.695
## Date.month.fctr06 -0.573
## Date.month.fctr07 -0.516
## Date.month.fctr08 -0.480
## Date.month.fctr09 -0.496
## Date.month.fctr10 -0.476
## Date.month.fctr11 -0.463
## Date.month.fctr12 -0.401
## Date.wkday.fctr1 -3.580
## Date.wkday.fctr2 -3.959
## Date.wkday.fctr3 -2.975
## Date.wkday.fctr4 -4.704
## Date.wkday.fctr5 -2.799
## Date.wkday.fctr6 -2.117
## Date.wkend NA
## Date.year.fctr2002 -0.818
## Date.year.fctr2003 -0.940
## Date.year.fctr2004 -0.988
## Date.year.fctr2005 -0.916
## Date.year.fctr2006 -0.912
## Date.year.fctr2007 -0.726
## Date.year.fctr2008 -0.735
## Date.year.fctr2009 -0.677
## Date.year.fctr2010 -0.835
## Date.year.fctr2011 -0.891
## Date.year.fctr2012 -0.891
## `District.fctr1-9` -0.874
## `District.fctr10-19` 0.715
## `District.fctr20+` 0.247
## ID -4.322
## LocationDescription.my.fctrALLEY 4.431
## LocationDescription.my.fctrCommercialVehicle 10.156
## LocationDescription.my.fctrEntertainment 4.267
## `LocationDescription.my.fctrGAS STATION` 18.772
## LocationDescription.my.fctrGovernment 5.362
## LocationDescription.my.fctrOther 8.103
## `LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)` 9.528
## LocationDescription.my.fctrResidence 6.735
## LocationDescription.my.fctrSchool 5.240
## LocationDescription.my.fctrSidewalk 1.091
## `LocationDescription.my.fctrVACANT LOT/LAND` -1.102
## `LocationDescription.my.fctrVEHICLE NON-COMMERCIAL` 2.114
## LocationDescription.my.fctrcha -1.377
## Year NA
## Pr(>|z|)
## (Intercept) 0.011409 *
## .pos 0.000897 ***
## .pos.y NA
## .rnorm 0.024884 *
## Beat 0.156120
## `Date.date.fctr(7,13]` 0.916719
## `Date.date.fctr(13,19]` 0.316216
## `Date.date.fctr(19,25]` 0.906570
## `Date.date.fctr(25,31]` 0.806218
## Date.day.minutes.poly.1 0.048734 *
## Date.day.minutes.poly.2 3.59e-05 ***
## Date.day.minutes.poly.3 0.000321 ***
## Date.day.minutes.poly.4 0.571839
## Date.day.minutes.poly.5 0.024282 *
## `Date.hour.fctr(7.67,15.3]` 0.197816
## `Date.hour.fctr(15.3,23]` 0.167906
## Date.juliandate 0.775618
## Date.last16.log1p 0.123282
## Date.last2.log1p 0.003474 **
## Date.last32.log1p 0.090009 .
## Date.last4.log1p 0.004491 **
## Date.last8.log1p 0.426718
## `Date.minute.fctr(14.8,29.5]` < 2e-16 ***
## `Date.minute.fctr(29.5,44.2]` 6.81e-07 ***
## `Date.minute.fctr(44.2,59.1]` < 2e-16 ***
## Date.month.fctr02 0.685887
## Date.month.fctr03 0.464639
## Date.month.fctr04 0.547351
## Date.month.fctr05 0.487034
## Date.month.fctr06 0.566503
## Date.month.fctr07 0.606132
## Date.month.fctr08 0.631034
## Date.month.fctr09 0.619679
## Date.month.fctr10 0.633935
## Date.month.fctr11 0.643674
## Date.month.fctr12 0.688761
## Date.wkday.fctr1 0.000343 ***
## Date.wkday.fctr2 7.53e-05 ***
## Date.wkday.fctr3 0.002926 **
## Date.wkday.fctr4 2.55e-06 ***
## Date.wkday.fctr5 0.005128 **
## Date.wkday.fctr6 0.034244 *
## Date.wkend NA
## Date.year.fctr2002 0.413441
## Date.year.fctr2003 0.347392
## Date.year.fctr2004 0.322933
## Date.year.fctr2005 0.359500
## Date.year.fctr2006 0.361668
## Date.year.fctr2007 0.468113
## Date.year.fctr2008 0.462581
## Date.year.fctr2009 0.498405
## Date.year.fctr2010 0.403592
## Date.year.fctr2011 0.373153
## Date.year.fctr2012 0.372840
## `District.fctr1-9` 0.382120
## `District.fctr10-19` 0.474720
## `District.fctr20+` 0.804728
## ID 1.55e-05 ***
## LocationDescription.my.fctrALLEY 9.36e-06 ***
## LocationDescription.my.fctrCommercialVehicle < 2e-16 ***
## LocationDescription.my.fctrEntertainment 1.98e-05 ***
## `LocationDescription.my.fctrGAS STATION` < 2e-16 ***
## LocationDescription.my.fctrGovernment 8.21e-08 ***
## LocationDescription.my.fctrOther 5.36e-16 ***
## `LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)` < 2e-16 ***
## LocationDescription.my.fctrResidence 1.64e-11 ***
## LocationDescription.my.fctrSchool 1.61e-07 ***
## LocationDescription.my.fctrSidewalk 0.275245
## `LocationDescription.my.fctrVACANT LOT/LAND` 0.270406
## `LocationDescription.my.fctrVEHICLE NON-COMMERCIAL` 0.034475 *
## LocationDescription.my.fctrcha 0.168533
## Year NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 107842 on 191628 degrees of freedom
## Residual deviance: 104747 on 191560 degrees of freedom
## AIC: 104885
##
## Number of Fisher Scoring iterations: 6
##
## [1] "myfit_mdl: train diagnostics complete: 241.207000 secs"
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Prediction
## Reference F T
## F 135869 40224
## T 9733 5803
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.73930355 0.07657829 0.73733172 0.74126813 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Prediction
## Reference F T
## F 0 176093
## T 0 15536
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.08107332 0.00000000 0.07985499 0.08230435 0.91892668
## AccuracyPValue McnemarPValue
## 1.00000000 0.00000000
## [1] "myfit_mdl: predict complete: 499.237000 secs"
## id
## 1 All.X##rcv#glm
## feats
## 1 .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 155.215 16.581
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.6283329
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.1885223 0.9189267
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7373317 0.7412681 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.6259209
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0 0.1499867 0.08107332
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.07985499 0.08230435 0
## max.AccuracySD.fit max.KappaSD.fit
## 1 7.220992e-06 0
## [1] "myfit_mdl: exit: 499.252000 secs"
# Check if other preProcess methods improve model performance
# Register a new minor step, labelled "preProc", in the fit.models_1 chunk
# tracker. major.inc = FALSE keeps step_major unchanged and advances
# step_minor, as the tracker rows printed below this call show.
# NOTE(review): judging by that printed output, myadd_chunk also fills in the
# end / elapsed time of the previous step -- helper is defined outside this
# file, confirm there.
fit.models_1_chunk_df <-
myadd_chunk(fit.models_1_chunk_df, "fit.models_1_preProc", major.inc = FALSE,
label.minor = "preProc")
## label step_major step_minor label_minor bgn end
## 4 fit.models_1_All.X 1 3 glm 4825.911 5330.678
## 5 fit.models_1_preProc 1 4 preProc 5330.678 NA
## elapsed
## 4 504.767
## 5 NA
# Select the top-ranked model so far (per the model-selection formula) and
# recover the pieces needed to refit it with alternative preProcess methods.
mdl_id <- orderBy(get_model_sel_frmla(), glb_models_df)[1, "id"]

# Feature names are stored as a single comma-separated string in "feats".
indepVar <- str_trim(unlist(strsplit(
    glb_models_df[glb_models_df$id == mdl_id, "feats"], "[,]")))

# Model ids are "#"-delimited, e.g. "All.X##rcv#glmnet": the family prefix is
# the text before the first "#", the caret method the text after the last "#".
# Splitting on "." (the legacy id format) would yield method "X##rcv#glmnet"
# and prefix "All" for such an id, so it is used only when no "#" is present.
if (grepl("#", mdl_id, fixed = TRUE)) {
    method <- sub("^.*#", "", mdl_id)
    mdl_id_pfx <- sub("#.*$", "", mdl_id)
} else {
    method <- tail(unlist(strsplit(mdl_id, "[.]")), 1)
    mdl_id_pfx <- paste0(head(unlist(strsplit(mdl_id, "[.]")), -1),
                         collapse = ".")
}

# Drop any observations registered as outliers for this model family and
# report which ids were removed; otherwise fit on all fit observations.
if (!is.null(glbObsFitOutliers[[mdl_id_pfx]])) {
    fitobs_df <- glbObsFit[!(glbObsFit[, glbFeatsId] %in%
                             glbObsFitOutliers[[mdl_id_pfx]]), ]
    print(sprintf("Outliers removed: %d", nrow(glbObsFit) - nrow(fitobs_df)))
    print(setdiff(glbObsFit[, glbFeatsId], fitobs_df[, glbFeatsId]))
} else fitobs_df <- glbObsFit
# Refit the selected model once per candidate preProcess method, with the same
# resampling / metric configuration as the main fit loop, so the results are
# directly comparable.
for (prePr in glb_preproc_methods) {
    # The operations are applied in this order:
    # Box-Cox/Yeo-Johnson transformation, centering, scaling, range, imputation, PCA, ICA then spatial sign.
    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = mdl_id_pfx,
            type = glb_model_type,
            tune.df = glbMdlTuneParams,
            trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            # Honour the global parallelism switch here too, as the main fit
            # loop does; otherwise this pass ignores glbMdlAllowParallel.
            trainControl.allowParallel = glbMdlAllowParallel,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = method,
            train.preProcess = prePr)),
        indepVar = indepVar, rsp_var = glb_rsp_var,
        fit_df = fitobs_df, OOB_df = glbObsOOB)
}
# If (All|RFE).X.glm is less accurate than Low.Cor.X.glm
# check NA coefficients & filter appropriate terms in indepVar
# if (method == "glm") {
# orig_glm <- glb_models_lst[[paste0(mdl_id, ".", model_method)]]$finalModel
# orig_glm <- glb_models_lst[["All.X.glm"]]$finalModel; print(summary(orig_glm))
# orig_glm <- glb_models_lst[["RFE.X.glm"]]$finalModel; print(summary(orig_glm))
# require(car)
# vif_orig_glm <- vif(orig_glm); print(vif_orig_glm)
# # if vif errors out with "there are aliased coefficients in the model"
# alias_orig_glm <- alias(orig_glm); alias_complete_orig_glm <- (alias_orig_glm$Complete > 0); alias_complete_orig_glm <- alias_complete_orig_glm[rowSums(alias_complete_orig_glm) > 0, colSums(alias_complete_orig_glm) > 0]; print(alias_complete_orig_glm)
# print(vif_orig_glm[!is.na(vif_orig_glm) & (vif_orig_glm == Inf)])
# print(which.max(vif_orig_glm))
# print(sort(vif_orig_glm[vif_orig_glm >= 1.0e+03], decreasing=TRUE))
# glbObsFit[c(1143, 3637, 3953, 4105), c("UniqueID", "Popular", "H.P.quandary", "Headline")]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE), ]
# all.equal(glbObsAll$S.chrs.uppr.n.log, glbObsAll$A.chrs.uppr.n.log)
# cor(glbObsAll$S.T.herald, glbObsAll$S.T.tribun)
# mydspObs(Abstract.contains="[Dd]iar", cols=("Abstract"), all=TRUE)
# subset(glb_feats_df, cor.y.abs <= glb_feats_df[glb_feats_df$id == ".rnorm", "cor.y.abs"])
# corxx_mtrx <- cor(data.matrix(glbObsAll[, setdiff(names(glbObsAll), myfind_chr_cols_df(glbObsAll))]), use="pairwise.complete.obs"); abs_corxx_mtrx <- abs(corxx_mtrx); diag(abs_corxx_mtrx) <- 0
# which.max(abs_corxx_mtrx["S.T.tribun", ])
# abs_corxx_mtrx["A.npnct08.log", "S.npnct08.log"]
# step_glm <- step(orig_glm)
# }
# Since caret does not optimize rpart well
# if (method == "rpart")
# ret_lst <- myfit_mdl(mdl_id=paste0(mdl_id_pfx, ".cp.0"), model_method=method,
# indepVar=indepVar,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=0, tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
# User specified
# Ensure at least 2 vars in each regression; else varImp crashes
# sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df; sav_featsimp_df <- glb_featsimp_df; all.equal(sav_featsimp_df, glb_featsimp_df)
# glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df; glb_featsimp_df <- sav_featsimp_df
# easier to exclude features
# require(gdata) # needed for trim
# mdl_id <- "";
# indepVar <- head(subset(glb_models_df, grepl("All\\.X\\.", mdl_id), select=feats)
# , 1)[, "feats"]
# indepVar <- trim(unlist(strsplit(indepVar, "[,]")))
# indepVar <- setdiff(indepVar, ".rnorm")
# easier to include features
#stop(here"); sav_models_df <- glb_models_df; glb_models_df <- sav_models_df
# !_sp
# mdl_id <- "csm"; indepVar <- c(NULL
# ,"prdline.my.fctr", "prdline.my.fctr:.clusterid.fctr"
# ,"prdline.my.fctr*biddable"
# #,"prdline.my.fctr*startprice.log"
# #,"prdline.my.fctr*startprice.diff"
# ,"prdline.my.fctr*condition.fctr"
# ,"prdline.my.fctr*D.terms.post.stop.n"
# #,"prdline.my.fctr*D.terms.post.stem.n"
# ,"prdline.my.fctr*cellular.fctr"
# # ,"<feat1>:<feat2>"
# )
# for (method in glbMdlMethods) {
# ret_lst <- myfit_mdl(mdl_id=mdl_id, model_method=method,
# indepVar=indepVar,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glbMdlTuneParams)
# csm_mdl_id <- paste0(mdl_id, ".", method)
# csm_featsimp_df <- myget_feats_importance(glb_models_lst[[paste0(mdl_id, ".",
# method)]]); print(head(csm_featsimp_df))
# }
###
# Ntv.1.lm <- lm(reformulate(indepVar, glb_rsp_var), glbObsTrn); print(summary(Ntv.1.lm))
#glb_models_df[, "max.Accuracy.OOB", FALSE]
#varImp(glb_models_lst[["Low.cor.X.glm"]])
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.2.glm"]])$imp)
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.3.glm"]])$imp)
#glb_feats_df[grepl("npnct28", glb_feats_df$id), ]
# User specified bivariate models
# indepVar_lst <- list()
# for (feat in setdiff(names(glbObsFit),
# union(glb_rsp_var, glbFeatsExclude)))
# indepVar_lst[["feat"]] <- feat
# User specified combinatorial models
# indepVar_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indepVar_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# template for myfit_mdl
# rf is hard-coded in caret to recognize only Accuracy / Kappa evaluation metrics
# only for OOB in trainControl ?
# ret_lst <- myfit_mdl_fn(mdl_id=paste0(mdl_id_pfx, ""), model_method=method,
# indepVar=indepVar,
# rsp_var=glb_rsp_var,
# fit_df=glbObsFit, OOB_df=glbObsOOB,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glbMdlTuneParams,
# model_loss_mtrx=glbMdlMetric_terms,
# model_summaryFunction=glbMdlMetricSummaryFn,
# model_metric=glbMdlMetricSummary,
# model_metric_maximize=glbMdlMetricMaximize)
# Simplify a model
# fit_df <- glbObsFit; glb_mdl <- step(<complex>_mdl)
# Non-caret models
# rpart_area_mdl <- rpart(reformulate("Area", response=glb_rsp_var),
# data=glbObsFit, #method="class",
# control=rpart.control(cp=0.12),
# parms=list(loss=glbMdlMetric_terms))
# print("rpart_sel_wlm_mdl"); prp(rpart_sel_wlm_mdl)
#
# Display the accumulated model summary table: one row per fitted model id,
# with its feature list, timing and fit / OOB metric columns.
print(glb_models_df)
## id
## MFO###myMFO_classfr MFO###myMFO_classfr
## Random###myrandom_classfr Random###myrandom_classfr
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1###glmnet
## Max.cor.Y##rcv#rpart Max.cor.Y##rcv#rpart
## Max.cor.Y.Time.Poly##rcv#glmnet Max.cor.Y.Time.Poly##rcv#glmnet
## Max.cor.Y.Time.Lag##rcv#glmnet Max.cor.Y.Time.Lag##rcv#glmnet
## Interact.High.cor.Y##rcv#glmnet Interact.High.cor.Y##rcv#glmnet
## Low.cor.X##rcv#glmnet Low.cor.X##rcv#glmnet
## All.X##rcv#glmnet All.X##rcv#glmnet
## All.X##rcv#glm All.X##rcv#glm
## feats
## MFO###myMFO_classfr .rnorm
## Random###myrandom_classfr .rnorm
## Max.cor.Y.rcv.1X1###glmnet ID,LocationDescription.my.fctr
## Max.cor.Y##rcv#rpart ID,LocationDescription.my.fctr
## Max.cor.Y.Time.Poly##rcv#glmnet ID,LocationDescription.my.fctr,Date.day.minutes.poly.1,Date.day.minutes.poly.2,Date.day.minutes.poly.3,Date.day.minutes.poly.4,Date.day.minutes.poly.5
## Max.cor.Y.Time.Lag##rcv#glmnet ID,LocationDescription.my.fctr,Date.last2.log1p,Date.last4.log1p,Date.last8.log1p,Date.last16.log1p,Date.last32.log1p
## Interact.High.cor.Y##rcv#glmnet ID,LocationDescription.my.fctr,ID:ID,ID:.pos,ID:Date.month.fctr,ID:Date.year.fctr
## Low.cor.X##rcv#glmnet .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID
## All.X##rcv#glmnet .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID
## All.X##rcv#glm .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID
## max.nTuningRuns min.elapsedtime.everything
## MFO###myMFO_classfr 0 1.460
## Random###myrandom_classfr 0 0.608
## Max.cor.Y.rcv.1X1###glmnet 0 4.977
## Max.cor.Y##rcv#rpart 5 78.814
## Max.cor.Y.Time.Poly##rcv#glmnet 20 205.599
## Max.cor.Y.Time.Lag##rcv#glmnet 20 215.736
## Interact.High.cor.Y##rcv#glmnet 25 331.147
## Low.cor.X##rcv#glmnet 25 615.615
## All.X##rcv#glmnet 25 604.994
## All.X##rcv#glm 1 155.215
## min.elapsedtime.final max.AUCpROC.fit
## MFO###myMFO_classfr 0.043 0.5000000
## Random###myrandom_classfr 0.042 0.5008987
## Max.cor.Y.rcv.1X1###glmnet 3.256 0.5000000
## Max.cor.Y##rcv#rpart 3.236 0.5000000
## Max.cor.Y.Time.Poly##rcv#glmnet 4.400 0.5000000
## Max.cor.Y.Time.Lag##rcv#glmnet 5.030 0.5000000
## Interact.High.cor.Y##rcv#glmnet 8.347 0.5000000
## Low.cor.X##rcv#glmnet 16.334 0.5000000
## All.X##rcv#glmnet 16.301 0.5000000
## All.X##rcv#glm 16.581 0.5000000
## max.Sens.fit max.Spec.fit max.AUCROCR.fit
## MFO###myMFO_classfr 1.0000000 0.00000000 0.5000000
## Random###myrandom_classfr 0.9191507 0.08264676 0.4996512
## Max.cor.Y.rcv.1X1###glmnet 1.0000000 0.00000000 0.6119694
## Max.cor.Y##rcv#rpart 1.0000000 0.00000000 0.5000000
## Max.cor.Y.Time.Poly##rcv#glmnet 1.0000000 0.00000000 0.6151250
## Max.cor.Y.Time.Lag##rcv#glmnet 1.0000000 0.00000000 0.6148796
## Interact.High.cor.Y##rcv#glmnet 1.0000000 0.00000000 0.6134441
## Low.cor.X##rcv#glmnet 1.0000000 0.00000000 0.6243901
## All.X##rcv#glmnet 1.0000000 0.00000000 0.6243901
## All.X##rcv#glm 1.0000000 0.00000000 0.6283329
## opt.prob.threshold.fit max.f.score.fit
## MFO###myMFO_classfr 0.0 0.1499867
## Random###myrandom_classfr 0.0 0.1499867
## Max.cor.Y.rcv.1X1###glmnet 0.1 0.1740375
## Max.cor.Y##rcv#rpart 0.0 0.1499867
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1 0.1770876
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1 0.1757976
## Interact.High.cor.Y##rcv#glmnet 0.1 0.1694825
## Low.cor.X##rcv#glmnet 0.1 0.1838306
## All.X##rcv#glmnet 0.1 0.1838306
## All.X##rcv#glm 0.1 0.1885223
## max.Accuracy.fit max.AccuracyLower.fit
## MFO###myMFO_classfr 0.08107332 0.07985499
## Random###myrandom_classfr 0.08107332 0.07985499
## Max.cor.Y.rcv.1X1###glmnet 0.73198211 0.72999288
## Max.cor.Y##rcv#rpart 0.91855269 0.07985499
## Max.cor.Y.Time.Poly##rcv#glmnet 0.91892668 0.75528694
## Max.cor.Y.Time.Lag##rcv#glmnet 0.91892668 0.75934736
## Interact.High.cor.Y##rcv#glmnet 0.91892668 0.78550529
## Low.cor.X##rcv#glmnet 0.91892668 0.75024836
## All.X##rcv#glmnet 0.91892668 0.75024836
## All.X##rcv#glm 0.91892668 0.73733172
## max.AccuracyUpper.fit max.Kappa.fit
## MFO###myMFO_classfr 0.08230435 0.000000000
## Random###myrandom_classfr 0.08230435 0.000000000
## Max.cor.Y.rcv.1X1###glmnet 0.73396432 0.059659401
## Max.cor.Y##rcv#rpart 0.08230435 0.002468544
## Max.cor.Y.Time.Poly##rcv#glmnet 0.75913159 0.000000000
## Max.cor.Y.Time.Lag##rcv#glmnet 0.76317003 0.000000000
## Interact.High.cor.Y##rcv#glmnet 0.78917460 0.000000000
## Low.cor.X##rcv#glmnet 0.75411965 0.000000000
## All.X##rcv#glmnet 0.75411965 0.000000000
## All.X##rcv#glm 0.74126813 0.000000000
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## MFO###myMFO_classfr 0.5000000 1.0000000 0.00000000
## Random###myrandom_classfr 0.4998215 0.9191847 0.08045829
## Max.cor.Y.rcv.1X1###glmnet 0.5000000 1.0000000 0.00000000
## Max.cor.Y##rcv#rpart 0.5000000 1.0000000 0.00000000
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5000000 1.0000000 0.00000000
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5000000 1.0000000 0.00000000
## Interact.High.cor.Y##rcv#glmnet 0.5000000 1.0000000 0.00000000
## Low.cor.X##rcv#glmnet 0.5000000 1.0000000 0.00000000
## All.X##rcv#glmnet 0.5000000 1.0000000 0.00000000
## All.X##rcv#glm 0.5000000 1.0000000 0.00000000
## max.AUCROCR.OOB opt.prob.threshold.OOB
## MFO###myMFO_classfr 0.5000000 0.0
## Random###myrandom_classfr 0.4997988 0.0
## Max.cor.Y.rcv.1X1###glmnet 0.6119694 0.1
## Max.cor.Y##rcv#rpart 0.5000000 0.0
## Max.cor.Y.Time.Poly##rcv#glmnet 0.6151250 0.1
## Max.cor.Y.Time.Lag##rcv#glmnet 0.6170827 0.1
## Interact.High.cor.Y##rcv#glmnet 0.5522346 0.1
## Low.cor.X##rcv#glmnet 0.6249179 0.1
## All.X##rcv#glmnet 0.6249179 0.1
## All.X##rcv#glm 0.6259209 0.0
## max.f.score.OOB max.Accuracy.OOB
## MFO###myMFO_classfr 0.1499867 0.08107332
## Random###myrandom_classfr 0.1499867 0.08107332
## Max.cor.Y.rcv.1X1###glmnet 0.1740375 0.73198211
## Max.cor.Y##rcv#rpart 0.1499867 0.08107332
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1770876 0.75721316
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1716109 0.79132595
## Interact.High.cor.Y##rcv#glmnet 0.1499867 0.08107332
## Low.cor.X##rcv#glmnet 0.1813771 0.44703046
## All.X##rcv#glmnet 0.1813771 0.44703046
## All.X##rcv#glm 0.1499867 0.08107332
## max.AccuracyLower.OOB
## MFO###myMFO_classfr 0.07985499
## Random###myrandom_classfr 0.07985499
## Max.cor.Y.rcv.1X1###glmnet 0.72999288
## Max.cor.Y##rcv#rpart 0.07985499
## Max.cor.Y.Time.Poly##rcv#glmnet 0.75528694
## Max.cor.Y.Time.Lag##rcv#glmnet 0.78949953
## Interact.High.cor.Y##rcv#glmnet 0.07985499
## Low.cor.X##rcv#glmnet 0.44480261
## All.X##rcv#glmnet 0.44480261
## All.X##rcv#glm 0.07985499
## max.AccuracyUpper.OOB max.Kappa.OOB
## MFO###myMFO_classfr 0.08230435 0.00000000
## Random###myrandom_classfr 0.08230435 0.00000000
## Max.cor.Y.rcv.1X1###glmnet 0.73396432 0.05965940
## Max.cor.Y##rcv#rpart 0.08230435 0.00000000
## Max.cor.Y.Time.Poly##rcv#glmnet 0.75913159 0.06742643
## Max.cor.Y.Time.Lag##rcv#glmnet 0.79314355 0.06926659
## Interact.High.cor.Y##rcv#glmnet 0.08230435 0.00000000
## Low.cor.X##rcv#glmnet 0.44925992 0.04513121
## All.X##rcv#glmnet 0.44925992 0.04513121
## All.X##rcv#glm 0.08230435 0.00000000
## max.AccuracySD.fit max.KappaSD.fit
## MFO###myMFO_classfr NA NA
## Random###myrandom_classfr NA NA
## Max.cor.Y.rcv.1X1###glmnet NA NA
## Max.cor.Y##rcv#rpart 1.590894e-04 0.001228904
## Max.cor.Y.Time.Poly##rcv#glmnet 7.220992e-06 0.000000000
## Max.cor.Y.Time.Lag##rcv#glmnet 7.220992e-06 0.000000000
## Interact.High.cor.Y##rcv#glmnet 7.220992e-06 0.000000000
## Low.cor.X##rcv#glmnet 7.220992e-06 0.000000000
## All.X##rcv#glmnet 7.220992e-06 0.000000000
## All.X##rcv#glm 7.220992e-06 0.000000000
# Drop the temporary fit result now that the model summaries have been printed.
rm(ret_lst)
# Record the "fit.models_1_end" (teardown) step in this section's chunk
# tracker; myadd_chunk() is a project helper whose printed table shows
# bgn / end / elapsed columns per step.
fit.models_1_chunk_df <-
myadd_chunk(fit.models_1_chunk_df, "fit.models_1_end", major.inc = FALSE,
label.minor = "teardown")
## label step_major step_minor label_minor bgn end
## 5 fit.models_1_preProc 1 4 preProc 5330.678 5330.954
## 6 fit.models_1_end 1 5 teardown 5330.954 NA
## elapsed
## 5 0.276
## 6 NA
# Add another "fit.models" row to the script-wide chunk tracker without
# incrementing the major step number (major.inc = FALSE).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc = FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 17 fit.models 8 1 1 3901.073 5330.965 1429.892
## 18 fit.models 8 2 2 5330.965 NA NA
# Start a fresh chunk tracker (NULL input) for the second fit.models section,
# beginning with its "setup" step.
fit.models_2_chunk_df <-
myadd_chunk(NULL, "fit.models_2_bgn", label.minor = "setup")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_2_bgn 1 0 setup 5379.276 NA NA
# Build the plotting copy of the model summary:
#   * drop the SD / Upper / Lower helper columns
#   * replace every "min.*" column by its reciprocal under an "inv.*" name
# Logical masks are used instead of negative grep() indices because
# df[, -grep(...)] selects *zero* columns when grep() matches nothing.
plt_models_df <- glb_models_df[, !grepl("SD|Upper|Lower", names(glb_models_df)),
                               drop = FALSE]
# "\\." pins the literal dot so only names with the exact "min." prefix qualify
for (var in grep("^min\\.", names(plt_models_df), value = TRUE)) {
    plt_models_df[, sub("^min\\.", "inv.", var)] <-
        #ifelse(all(is.na(tmp <- plt_models_df[, var])), NA, 1.0 / tmp)
        1.0 / plt_models_df[, var]
    # Remove exactly the source column (exact name match, not a regex match)
    plt_models_df <- plt_models_df[, names(plt_models_df) != var, drop = FALSE]
}
print(plt_models_df)
## id
## MFO###myMFO_classfr MFO###myMFO_classfr
## Random###myrandom_classfr Random###myrandom_classfr
## Max.cor.Y.rcv.1X1###glmnet Max.cor.Y.rcv.1X1###glmnet
## Max.cor.Y##rcv#rpart Max.cor.Y##rcv#rpart
## Max.cor.Y.Time.Poly##rcv#glmnet Max.cor.Y.Time.Poly##rcv#glmnet
## Max.cor.Y.Time.Lag##rcv#glmnet Max.cor.Y.Time.Lag##rcv#glmnet
## Interact.High.cor.Y##rcv#glmnet Interact.High.cor.Y##rcv#glmnet
## Low.cor.X##rcv#glmnet Low.cor.X##rcv#glmnet
## All.X##rcv#glmnet All.X##rcv#glmnet
## All.X##rcv#glm All.X##rcv#glm
## feats
## MFO###myMFO_classfr .rnorm
## Random###myrandom_classfr .rnorm
## Max.cor.Y.rcv.1X1###glmnet ID,LocationDescription.my.fctr
## Max.cor.Y##rcv#rpart ID,LocationDescription.my.fctr
## Max.cor.Y.Time.Poly##rcv#glmnet ID,LocationDescription.my.fctr,Date.day.minutes.poly.1,Date.day.minutes.poly.2,Date.day.minutes.poly.3,Date.day.minutes.poly.4,Date.day.minutes.poly.5
## Max.cor.Y.Time.Lag##rcv#glmnet ID,LocationDescription.my.fctr,Date.last2.log1p,Date.last4.log1p,Date.last8.log1p,Date.last16.log1p,Date.last32.log1p
## Interact.High.cor.Y##rcv#glmnet ID,LocationDescription.my.fctr,ID:ID,ID:.pos,ID:Date.month.fctr,ID:Date.year.fctr
## Low.cor.X##rcv#glmnet .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID
## All.X##rcv#glmnet .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID
## All.X##rcv#glm .pos,.pos.y,LocationDescription.my.fctr,Date.minute.fctr,Date.day.minutes.poly.5,Date.wkend,Date.last2.log1p,Date.day.minutes.poly.1,Date.hour.fctr,Date.day.minutes.poly.4,Date.last16.log1p,Date.last32.log1p,Date.date.fctr,Beat,Date.last8.log1p,Date.last4.log1p,Date.wkday.fctr,.rnorm,Date.day.minutes.poly.3,Date.juliandate,Date.month.fctr,Date.day.minutes.poly.2,District.fctr,Year,Date.year.fctr,ID
## max.nTuningRuns max.AUCpROC.fit
## MFO###myMFO_classfr 0 0.5000000
## Random###myrandom_classfr 0 0.5008987
## Max.cor.Y.rcv.1X1###glmnet 0 0.5000000
## Max.cor.Y##rcv#rpart 5 0.5000000
## Max.cor.Y.Time.Poly##rcv#glmnet 20 0.5000000
## Max.cor.Y.Time.Lag##rcv#glmnet 20 0.5000000
## Interact.High.cor.Y##rcv#glmnet 25 0.5000000
## Low.cor.X##rcv#glmnet 25 0.5000000
## All.X##rcv#glmnet 25 0.5000000
## All.X##rcv#glm 1 0.5000000
## max.Sens.fit max.Spec.fit max.AUCROCR.fit
## MFO###myMFO_classfr 1.0000000 0.00000000 0.5000000
## Random###myrandom_classfr 0.9191507 0.08264676 0.4996512
## Max.cor.Y.rcv.1X1###glmnet 1.0000000 0.00000000 0.6119694
## Max.cor.Y##rcv#rpart 1.0000000 0.00000000 0.5000000
## Max.cor.Y.Time.Poly##rcv#glmnet 1.0000000 0.00000000 0.6151250
## Max.cor.Y.Time.Lag##rcv#glmnet 1.0000000 0.00000000 0.6148796
## Interact.High.cor.Y##rcv#glmnet 1.0000000 0.00000000 0.6134441
## Low.cor.X##rcv#glmnet 1.0000000 0.00000000 0.6243901
## All.X##rcv#glmnet 1.0000000 0.00000000 0.6243901
## All.X##rcv#glm 1.0000000 0.00000000 0.6283329
## opt.prob.threshold.fit max.f.score.fit
## MFO###myMFO_classfr 0.0 0.1499867
## Random###myrandom_classfr 0.0 0.1499867
## Max.cor.Y.rcv.1X1###glmnet 0.1 0.1740375
## Max.cor.Y##rcv#rpart 0.0 0.1499867
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1 0.1770876
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1 0.1757976
## Interact.High.cor.Y##rcv#glmnet 0.1 0.1694825
## Low.cor.X##rcv#glmnet 0.1 0.1838306
## All.X##rcv#glmnet 0.1 0.1838306
## All.X##rcv#glm 0.1 0.1885223
## max.Accuracy.fit max.Kappa.fit
## MFO###myMFO_classfr 0.08107332 0.000000000
## Random###myrandom_classfr 0.08107332 0.000000000
## Max.cor.Y.rcv.1X1###glmnet 0.73198211 0.059659401
## Max.cor.Y##rcv#rpart 0.91855269 0.002468544
## Max.cor.Y.Time.Poly##rcv#glmnet 0.91892668 0.000000000
## Max.cor.Y.Time.Lag##rcv#glmnet 0.91892668 0.000000000
## Interact.High.cor.Y##rcv#glmnet 0.91892668 0.000000000
## Low.cor.X##rcv#glmnet 0.91892668 0.000000000
## All.X##rcv#glmnet 0.91892668 0.000000000
## All.X##rcv#glm 0.91892668 0.000000000
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB
## MFO###myMFO_classfr 0.5000000 1.0000000 0.00000000
## Random###myrandom_classfr 0.4998215 0.9191847 0.08045829
## Max.cor.Y.rcv.1X1###glmnet 0.5000000 1.0000000 0.00000000
## Max.cor.Y##rcv#rpart 0.5000000 1.0000000 0.00000000
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5000000 1.0000000 0.00000000
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5000000 1.0000000 0.00000000
## Interact.High.cor.Y##rcv#glmnet 0.5000000 1.0000000 0.00000000
## Low.cor.X##rcv#glmnet 0.5000000 1.0000000 0.00000000
## All.X##rcv#glmnet 0.5000000 1.0000000 0.00000000
## All.X##rcv#glm 0.5000000 1.0000000 0.00000000
## max.AUCROCR.OOB opt.prob.threshold.OOB
## MFO###myMFO_classfr 0.5000000 0.0
## Random###myrandom_classfr 0.4997988 0.0
## Max.cor.Y.rcv.1X1###glmnet 0.6119694 0.1
## Max.cor.Y##rcv#rpart 0.5000000 0.0
## Max.cor.Y.Time.Poly##rcv#glmnet 0.6151250 0.1
## Max.cor.Y.Time.Lag##rcv#glmnet 0.6170827 0.1
## Interact.High.cor.Y##rcv#glmnet 0.5522346 0.1
## Low.cor.X##rcv#glmnet 0.6249179 0.1
## All.X##rcv#glmnet 0.6249179 0.1
## All.X##rcv#glm 0.6259209 0.0
## max.f.score.OOB max.Accuracy.OOB
## MFO###myMFO_classfr 0.1499867 0.08107332
## Random###myrandom_classfr 0.1499867 0.08107332
## Max.cor.Y.rcv.1X1###glmnet 0.1740375 0.73198211
## Max.cor.Y##rcv#rpart 0.1499867 0.08107332
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1770876 0.75721316
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1716109 0.79132595
## Interact.High.cor.Y##rcv#glmnet 0.1499867 0.08107332
## Low.cor.X##rcv#glmnet 0.1813771 0.44703046
## All.X##rcv#glmnet 0.1813771 0.44703046
## All.X##rcv#glm 0.1499867 0.08107332
## max.Kappa.OOB inv.elapsedtime.everything
## MFO###myMFO_classfr 0.00000000 0.684931507
## Random###myrandom_classfr 0.00000000 1.644736842
## Max.cor.Y.rcv.1X1###glmnet 0.05965940 0.200924252
## Max.cor.Y##rcv#rpart 0.00000000 0.012688101
## Max.cor.Y.Time.Poly##rcv#glmnet 0.06742643 0.004863837
## Max.cor.Y.Time.Lag##rcv#glmnet 0.06926659 0.004635295
## Interact.High.cor.Y##rcv#glmnet 0.00000000 0.003019807
## Low.cor.X##rcv#glmnet 0.04513121 0.001624392
## All.X##rcv#glmnet 0.04513121 0.001652909
## All.X##rcv#glm 0.00000000 0.006442676
## inv.elapsedtime.final
## MFO###myMFO_classfr 23.25581395
## Random###myrandom_classfr 23.80952381
## Max.cor.Y.rcv.1X1###glmnet 0.30712531
## Max.cor.Y##rcv#rpart 0.30902349
## Max.cor.Y.Time.Poly##rcv#glmnet 0.22727273
## Max.cor.Y.Time.Lag##rcv#glmnet 0.19880716
## Interact.High.cor.Y##rcv#glmnet 0.11980352
## Low.cor.X##rcv#glmnet 0.06122199
## All.X##rcv#glmnet 0.06134593
## All.X##rcv#glm 0.06030999
# print(myplot_radar(radar_inp_df=plt_models_df))
# print(myplot_radar(radar_inp_df=subset(plt_models_df,
# !(mdl_id %in% grep("random|MFO", plt_models_df$id, value=TRUE)))))
# Compute CI for <metric>SD
# For every "<stat>SD<suffix>" column, derive "<stat>Upper<suffix>" and
# "<stat>Lower<suffix>" as <stat><suffix> -/+ qt(0.975, df) * SD, where
# df = max.nTuningRuns - 1 (NA when a model has at most one tuning run).
glb_models_df <- mutate(glb_models_df,
    max.df = ifelse(max.nTuningRuns > 1, max.nTuningRuns - 1, NA),
    min.sd2ci.scaler = ifelse(is.na(max.df), NA, qt(0.975, max.df)))
for (var in grep("SD", names(glb_models_df), value = TRUE, fixed = TRUE)) {
    # Derive the sibling column names by substituting the first "SD".
    # (Splitting on "SD" and pasting the pieces back produced names ending in
    #  "NA" whenever "SD" was the last part of the column name.)
    varActul <- sub("SD", "", var, fixed = TRUE)
    varUpper <- sub("SD", "Upper", var, fixed = TRUE)
    varLower <- sub("SD", "Lower", var, fixed = TRUE)

    # Does CI already exist ?
    if (varUpper %in% names(glb_models_df)) {
        warning(varUpper, " already exists in glb_models_df")
        # Assuming Lower also exists
        next
    }

    # Without the underlying statistic there is nothing to centre the CI on
    if (!(varActul %in% names(glb_models_df))) {
        warning(varActul, " not found in glb_models_df; CI skipped for ", var)
        next
    }

    print(sprintf("var:%s", var))
    # CI is dependent on sample size in t distribution; df=n-1
    glb_models_df[, varUpper] <- glb_models_df[, varActul] +
        glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
    glb_models_df[, varLower] <- glb_models_df[, varActul] -
        glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
}
## Warning: max.AccuracyUpper.fit already exists in glb_models_df
## [1] "var:max.KappaSD.fit"
# Plot metrics with CI
# Two parallel frames keyed by model id are assembled here:
#   plt_models_df   - the point statistic for every metric that has a CI
#   pltCI_models_df - the matching Upper / Lower bound columns
pltCI_models_df <- glb_models_df[, "id", drop = FALSE]
plt_models_df <- pltCI_models_df
upper_bound_cols <- grep("Upper", names(glb_models_df), value = TRUE)
for (var in upper_bound_cols) {
    # "<stat>Upper<suffix>" -> c("<stat>", "<suffix>")
    var_components <- unlist(strsplit(var, "Upper"))

    # Point statistic: the same name with "Upper" removed
    col_name <- paste(var_components, collapse = "")
    plt_models_df[, col_name] <- glb_models_df[, col_name]

    # Copy both bounds of the interval
    ci_col_names <- paste0(var_components[1], c("Upper", "Lower"),
                           var_components[2])
    for (name in ci_col_names) {
        pltCI_models_df[, name] <- glb_models_df[, name]
    }
}
# Reshape a wide model-stats frame into long format for faceted plotting.
#
# Args:
#   plt_models_df: data frame with an "id" column plus one column per
#                  statistic, named "<label>.<data>" (e.g. "max.Accuracy.OOB").
#
# Returns:
#   The melt() result (one row per id / statistic) with two character columns
#   added:
#     data  - the text after the last "." of `variable` (e.g. "OOB")
#     label - `variable` with that trailing ".<data>" suffix removed
#
# The suffix is stripped with an anchored pattern. Splitting on
# paste0(".", data) treated "." as a regex wildcard and cut at the first
# occurrence, so e.g. "max.profit.fit" was labelled "max.pr". The vectorised
# form also handles a zero-row input, which the 1:nrow() loop did not.
build_statsCI_data <- function(plt_models_df) {
    mltd_models_df <- melt(plt_models_df, id.vars = "id")
    stat_names <- as.character(mltd_models_df$variable)
    # Greedy match up to the last literal dot leaves only the final token
    mltd_models_df$data <- sub("^.*[.]", "", stat_names)
    # Remove only the final ".<token>"
    mltd_models_df$label <- sub("[.][^.]*$", "", stat_names)
    #print(mltd_models_df)
    return(mltd_models_df)
}
# Long-format point statistics (id / variable / value / data / label)
mltd_models_df <- build_statsCI_data(plt_models_df)
# Long-format CI bounds: one row per model id and Upper/Lower column
mltdCI_models_df <- melt(pltCI_models_df, id.vars="id")
# Tag every CI row with the statistic it belongs to (label), the data set
# (data) and which bound it is (type) by splitting the column name on
# "Upper" / "Lower"; the first bound keyword that splits the name wins.
for (row_ix in 1:nrow(mltdCI_models_df)) {
for (type in c("Upper", "Lower")) {
if (length(var_components <- unlist(strsplit(
as.character(mltdCI_models_df[row_ix, "variable"]), type))) > 1) {
#print(sprintf("row_ix:%d; type:%s; ", row_ix, type))
mltdCI_models_df[row_ix, "label"] <- var_components[1]
# For names of the form "<label>Upper.<data>" the second component is
# ".<data>"; splitting it on "." gives c("", "<data>"), hence element 2
mltdCI_models_df[row_ix, "data"] <-
unlist(strsplit(var_components[2], "[.]"))[2]
mltdCI_models_df[row_ix, "type"] <- type
break
}
}
}
# Spread the bound type into columns: one row per remaining key column
# combination (everything except type / value / variable), with value.<type>
# columns for the bounds
wideCI_models_df <- reshape(subset(mltdCI_models_df, select=-variable),
timevar="type",
idvar=setdiff(names(mltdCI_models_df), c("type", "value", "variable")),
direction="wide")
#print(wideCI_models_df)
# Attach the point statistic to each CI row; merge() joins on the columns the
# two frames share, and all.x=TRUE keeps CI rows that have no matching stat
mrgdCI_models_df <- merge(wideCI_models_df, mltd_models_df, all.x=TRUE)
#print(mrgdCI_models_df)
# Merge stats back in if CIs don't exist
# Every <label>.<data> combination seen so far is a candidate; those that are
# not yet in mltd_models_df (no CI was available for them) are fetched straight
# from glb_models_df so the plot still shows their point statistic.
present_var_types <- unique(paste(mltd_models_df$label, mltd_models_df$data,
                                  sep = "."))
data_types <- unique(mltd_models_df$data)
# label-major order, matching a nested label / data loop
candidate_var_types <- unlist(lapply(unique(mltd_models_df$label),
    function(stat_label) paste0(stat_label, ".", data_types)))
goback_vars <- setdiff(candidate_var_types, present_var_types)

# A combination that was never computed has no column to go back to; indexing
# glb_models_df with it would stop with "undefined columns selected"
missing_vars <- setdiff(goback_vars, names(glb_models_df))
if (length(missing_vars) > 0) {
    warning("Stats not found in glb_models_df; skipped: ",
            paste(missing_vars, collapse = ", "))
    goback_vars <- setdiff(goback_vars, missing_vars)
}

if (length(goback_vars) > 0) {
    mltd_goback_df <- build_statsCI_data(glb_models_df[, c("id", goback_vars)])
    mltd_models_df <- rbind(mltd_models_df, mltd_goback_df)
}
# mltd_models_df <- merge(mltd_models_df, glb_models_df[, c("id", "model_method")],
# all.x=TRUE)
# Bar chart of every model statistic (facet rows = statistic label, facet
# columns = data set) with CI error bars, written to <glbOut$pfx>models_bar.png
# and then printed again on the current device.
png(paste0(glbOut$pfx, "models_bar.png"), width=480*3, height=480*2)
#print(gp <- myplot_bar(mltd_models_df, "id", "value", colorcol_name="model_method") +
# The error-bar layer must be keyed by the "id" column of mrgdCI_models_df.
# That frame has no "mdl_id" column, so aes(x=mdl_id) fell back to whatever the
# global variable mdl_id held instead of each row's own model id.
print(gp <- myplot_bar(df=mltd_models_df, xcol_name="id", ycol_names="value") +
    geom_errorbar(data=mrgdCI_models_df,
        mapping=aes(x=id, ymax=value.Upper, ymin=value.Lower), width=0.5) +
    facet_grid(label ~ data, scales="free") +
    theme(axis.text.x = element_text(angle = 90,vjust = 0.5)))
## Warning: Removed 4 rows containing missing values (geom_errorbar).
dev.off()
## quartz_off_screen
## 2
print(gp)
## Warning: Removed 4 rows containing missing values (geom_errorbar).
# Columns shown in the model ranking: the id, every evaluation metric that is
# actually present in glb_models_df, and all "opt." columns.
model_col_names <- names(glb_models_df)
eval_metric_cols <- glbMdlMetricsEval[glbMdlMetricsEval %in% model_col_names]
opt_cols <- model_col_names[grepl("opt.", model_col_names, fixed = TRUE)]
dsp_models_cols <- c("id", eval_metric_cols, opt_cols)
# if (glb_is_classification && glb_is_binomial)
# dsp_models_cols <- c(dsp_models_cols, "opt.prob.threshold.OOB")

# Rank the models with the model-selection formula, then keep the display columns
ranked_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)
dsp_models_df <- ranked_models_df[, dsp_models_cols]
print(dsp_models_df)
## id max.Accuracy.OOB max.AUCROCR.OOB
## 6 Max.cor.Y.Time.Lag##rcv#glmnet 0.79132595 0.6170827
## 5 Max.cor.Y.Time.Poly##rcv#glmnet 0.75721316 0.6151250
## 3 Max.cor.Y.rcv.1X1###glmnet 0.73198211 0.6119694
## 8 Low.cor.X##rcv#glmnet 0.44703046 0.6249179
## 9 All.X##rcv#glmnet 0.44703046 0.6249179
## 10 All.X##rcv#glm 0.08107332 0.6259209
## 7 Interact.High.cor.Y##rcv#glmnet 0.08107332 0.5522346
## 4 Max.cor.Y##rcv#rpart 0.08107332 0.5000000
## 1 MFO###myMFO_classfr 0.08107332 0.5000000
## 2 Random###myrandom_classfr 0.08107332 0.4997988
## max.AUCpROC.OOB max.Accuracy.fit opt.prob.threshold.fit
## 6 0.5000000 0.91892668 0.1
## 5 0.5000000 0.91892668 0.1
## 3 0.5000000 0.73198211 0.1
## 8 0.5000000 0.91892668 0.1
## 9 0.5000000 0.91892668 0.1
## 10 0.5000000 0.91892668 0.1
## 7 0.5000000 0.91892668 0.1
## 4 0.5000000 0.91855269 0.0
## 1 0.5000000 0.08107332 0.0
## 2 0.4998215 0.08107332 0.0
## opt.prob.threshold.OOB
## 6 0.1
## 5 0.1
## 3 0.1
## 8 0.1
## 9 0.1
## 10 0.0
## 7 0.1
## 4 0.0
## 1 0.0
## 2 0.0
# print(myplot_radar(radar_inp_df = dsp_models_df))
# Show the ordering formula used to rank the models (its terms are negated in
# the printed output below)
print("Metrics used for model selection:"); print(get_model_sel_frmla())
## [1] "Metrics used for model selection:"
## ~-max.Accuracy.OOB - max.AUCROCR.OOB - max.AUCpROC.OOB - max.Accuracy.fit -
## opt.prob.threshold.OOB
## <environment: 0x7fda8dd622e0>
# The first row of the ranked display table is reported as the best model
print(sprintf("Best model id: %s", dsp_models_df[1, "id"]))
## [1] "Best model id: Max.cor.Y.Time.Lag##rcv#glmnet"
# Add the prediction columns of model `mdl_id` to `df`.
#
# Args:
#   df:      observations to score. The response is read from the column named
#            by the global glb_rsp_var (not from `rsp_var`).
#   mdl_id:  name of a fitted model in the global glb_models_lst.
#   rsp_var: response name; used only to derive the output column names via
#            mygetPredictIds(rsp_var, mdl_id).
#   prob_threshold_def: binomial classification only. Fallback probability
#            cut-off used when glb_models_df has no usable
#            "opt.prob.threshold.OOB" value for `mdl_id`.
#   verbose: if TRUE, print diagnostic plots / the rows with the largest
#            absolute error.
#
# Returns:
#   `df` with these columns added (names come from mygetPredictIds()):
#     value   - predicted response
#     prob    - classification only: probability of the second response level
#               (binomial) or of the predicted class (multinomial)
#     err     - regression: predicted - actual; classification: TRUE when the
#               prediction differs from the actual value
#     err.abs - regression: abs(err); binomial: distance of `prob` from the
#               actual class (1 for the second level, 0 for the first);
#               multinomial: 0 for correct rows, otherwise the model's
#               probability for the actual class
#     is.acc  - TRUE when the prediction equals the actual value
#
# Raises:
#   Stops when a binomial threshold is unavailable and prob_threshold_def is
#   NULL.
glb_get_predictions <- function(df, mdl_id, rsp_var, prob_threshold_def=NULL, verbose=FALSE) {
    mdl <- glb_models_lst[[mdl_id]]

    clmnNames <- mygetPredictIds(rsp_var, mdl_id)
    predct_var_name <- clmnNames$value
    predct_prob_var_name <- clmnNames$prob
    predct_accurate_var_name <- clmnNames$is.acc
    predct_error_var_name <- clmnNames$err
    predct_erabs_var_name <- clmnNames$err.abs

    if (glb_is_regression) {
        df[, predct_var_name] <- predict(mdl, newdata=df, type="raw")
        if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_var_name) +
                           facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
                           stat_smooth(method="glm"))

        df[, predct_error_var_name] <- df[, predct_var_name] - df[, glb_rsp_var]
        if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
                           #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
                           stat_smooth(method="auto"))
        if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_error_var_name) +
                           #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
                           stat_smooth(method="glm"))

        df[, predct_erabs_var_name] <- abs(df[, predct_error_var_name])
        if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))

        df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
    }

    if (glb_is_classification && glb_is_binomial) {
        prob_threshold <- glb_models_df[glb_models_df$id == mdl_id,
                                        "opt.prob.threshold.OOB"]
        # A model without a row in glb_models_df yields a zero-length lookup.
        # is.na() of that is logical(0), which is not a valid `if` condition,
        # so the length is tested first to keep the default path reachable.
        if (length(prob_threshold) == 0 || is.na(prob_threshold[1])) {
            warning("Using default probability threshold: ", prob_threshold_def)
            if (is.null(prob_threshold <- prob_threshold_def))
                stop("Default probability threshold is NULL")
        }

        # Column 2 of the probability output is treated as the second level
        df[, predct_prob_var_name] <- predict(mdl, newdata = df, type = "prob")[, 2]
        # (prob >= threshold) * 1 + 1 maps FALSE / TRUE to level index 1 / 2
        df[, predct_var_name] <-
            factor(levels(df[, glb_rsp_var])[
                (df[, predct_prob_var_name] >=
                     prob_threshold) * 1 + 1], levels(df[, glb_rsp_var]))

        # if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_var_name) +
        # facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
        # stat_smooth(method="glm"))

        df[, predct_error_var_name] <- df[, predct_var_name] != df[, glb_rsp_var]
        # if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
        # #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
        # stat_smooth(method="auto"))
        # if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_error_var_name) +
        # #facet_wrap(reformulate(glbFeatsCategory), scales = "free") +
        # stat_smooth(method="glm"))

        # if prediction is a TP (true +ve), measure distance from 1.0
        tp <- which((df[, predct_var_name] == df[, glb_rsp_var]) &
                    (df[, predct_var_name] == levels(df[, glb_rsp_var])[2]))
        df[tp, predct_erabs_var_name] <- abs(1 - df[tp, predct_prob_var_name])
        #rowIx <- which.max(df[tp, predct_erabs_var_name]); df[tp, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]

        # if prediction is a TN (true -ve), measure distance from 0.0
        tn <- which((df[, predct_var_name] == df[, glb_rsp_var]) &
                    (df[, predct_var_name] == levels(df[, glb_rsp_var])[1]))
        df[tn, predct_erabs_var_name] <- abs(0 - df[tn, predct_prob_var_name])
        #rowIx <- which.max(df[tn, predct_erabs_var_name]); df[tn, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]

        # if prediction is a FP (false +ve), measure distance from 0.0
        fp <- which((df[, predct_var_name] != df[, glb_rsp_var]) &
                    (df[, predct_var_name] == levels(df[, glb_rsp_var])[2]))
        df[fp, predct_erabs_var_name] <- abs(0 - df[fp, predct_prob_var_name])
        #rowIx <- which.max(df[fp, predct_erabs_var_name]); df[fp, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]

        # if prediction is a FN (false -ve), measure distance from 1.0
        fn <- which((df[, predct_var_name] != df[, glb_rsp_var]) &
                    (df[, predct_var_name] == levels(df[, glb_rsp_var])[1]))
        df[fn, predct_erabs_var_name] <- abs(1 - df[fn, predct_prob_var_name])
        #rowIx <- which.max(df[fn, predct_erabs_var_name]); df[fn, c(glbFeatsId, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]

        if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))

        df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
    }

    if (glb_is_classification && !glb_is_binomial) {
        df[, predct_var_name] <- predict(mdl, newdata = df, type = "raw")
        probCls <- predict(mdl, newdata = df, type = "prob")
        # Keep, per row, the probability of the class that was predicted
        df[, predct_prob_var_name] <- NA
        for (cls in names(probCls)) {
            mask <- (df[, predct_var_name] == cls)
            df[mask, predct_prob_var_name] <- probCls[mask, cls]
        }
        if (verbose) print(myplot_histogram(df, predct_prob_var_name,
                                            fill_col_name = predct_var_name))
        if (verbose) print(myplot_histogram(df, predct_prob_var_name,
                                            facet_frmla = paste0("~", glb_rsp_var)))

        df[, predct_error_var_name] <- df[, predct_var_name] != df[, glb_rsp_var]

        # if prediction is erroneous, measure predicted class prob from actual class prob
        df[, predct_erabs_var_name] <- 0
        for (cls in names(probCls)) {
            mask <- (df[, glb_rsp_var] == cls) & (df[, predct_error_var_name])
            df[mask, predct_erabs_var_name] <- probCls[mask, cls]
        }

        df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
    }

    return(df)
}
#stop(here"); glb2Sav(); glbObsAll <- savObsAll; glbObsTrn <- savObsTrn; glbObsFit <- savObsFit; glbObsOOB <- savObsOOB; sav_models_df <- glb_models_df; glb_models_df <- sav_models_df; glb_featsimp_df <- sav_featsimp_df
# Summarise a model's absolute prediction error per category.
#
# Args:
#   obs_df: observations; scored with glb_get_predictions() first when the
#           model's prediction column is not present yet.
#   mdl_id: model id whose prediction columns are summarised.
#   label:  tag embedded in the output column names (e.g. "fit").
#
# Returns:
#   An ungrouped frame with one row per value of the glbFeatsCategory column:
#   err.abs.<label>.sum, err.abs.<label>.mean and .n.<label> (row count).
myget_category_stats <- function(obs_df, mdl_id, label) {
    require(dplyr)
    require(lazyeval)

    pred_ids <- mygetPredictIds(glb_rsp_var, mdl_id)
    value_col <- pred_ids$value
    err_abs_col <- pred_ids$err.abs

    if (!(value_col %in% names(obs_df))) {
        obs_df <- glb_get_predictions(obs_df, mdl_id, glb_rsp_var)
    }

    keep_cols <- c(glbFeatsCategory, glb_rsp_var, value_col, err_abs_col)
    tmp_obs_df <- obs_df[, keep_cols]
    # tmp_obs_df <- obs_df %>%
    # dplyr::select_(glbFeatsCategory, glb_rsp_var, predct_var_name, predct_error_var_name)
    #dplyr::rename(startprice.log10.predict.RFE.X.glmnet.err=error_abs_OOB)

    # The absolute-error column was selected last; rename it per label
    err_label_col <- paste0("err.abs.", label)
    names(tmp_obs_df)[ncol(tmp_obs_df)] <- err_label_col

    grouped_obs_df <- dplyr::group_by_(tmp_obs_df, glbFeatsCategory)
    ret_ctgry_df <- dplyr::summarise_(grouped_obs_df,
        #interp(~sum(abs(var)), var=as.name(glb_rsp_var)),
        interp(~sum(var), var = as.name(err_label_col)),
        interp(~mean(var), var = as.name(err_label_col)),
        interp(~n()))
    names(ret_ctgry_df) <- c(glbFeatsCategory,
        #paste0(glb_rsp_var, ".abs.", label, ".sum"),
        paste0(err_label_col, ".sum"),
        paste0(err_label_col, ".mean"),
        paste0(".n.", label))
    #colSums(ret_ctgry_df[, -grep(glbFeatsCategory, names(ret_ctgry_df))])
    return(dplyr::ungroup(ret_ctgry_df))
}
#print(colSums((ctgry_df <- myget_category_stats(obs_df=glbObsFit, mdl_id="", label="fit"))[, -grep(glbFeatsCategory, names(ctgry_df))]))
# Fit "ensemble" models whose predictors are the predictions of previously
# fitted models. Skipped entirely when glb_mdl_ensemble is NULL.
#   glb_mdl_ensemble may be:
#     "auto"            -> every model ranked above the first baseline model
#     "%<d-% <R expr>"  -> an R expression that evaluates to the model ids
#     character vector  -> explicit model ids
if (!is.null(glb_mdl_ensemble)) {
    # NOTE(review): the chunk label is built from whatever mdl_id_pfx held
    #   before this block; it is reset to "Ensemble" only on the following
    #   statement -- confirm this ordering is intended.
    fit.models_2_chunk_df <- myadd_chunk(fit.models_2_chunk_df,
        paste0("fit.models_2_", mdl_id_pfx), major.inc = TRUE,
        label.minor = "ensemble")
    mdl_id_pfx <- "Ensemble"

    if (#(glb_is_regression) |
        ((glb_is_classification) & (!glb_is_binomial)))
        stop("Ensemble models not implemented yet for multinomial classification")

    # Ids of all non-ensemble models that rank above the best baseline
    # (MFO / Random / Baseline) model in the model-selection ordering.
    mygetEnsembleAutoMdlIds <- function() {
        tmp_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)
        row.names(tmp_models_df) <- tmp_models_df$id
        baseline_pos <- grep("MFO|Random|Baseline", tmp_models_df$id)
        # head() instead of 1:n so that a baseline ranked first yields no
        #   models (1:0 would wrongly select the baseline itself), and no
        #   baseline at all keeps every model instead of failing on 1:Inf
        mdlIds <- if (length(baseline_pos) > 0)
            head(tmp_models_df$id, min(baseline_pos) - 1) else
            tmp_models_df$id
        return(mdlIds[!grepl("Ensemble", mdlIds)])
    }

    # glb_mdl_ensemble may hold several model ids, so both tests are written
    #   to always produce a length-one condition
    if (identical(glb_mdl_ensemble, "auto")) {
        glb_mdl_ensemble <- mygetEnsembleAutoMdlIds()
        mdl_id_pfx <- paste0(mdl_id_pfx, ".auto")
    } else if ((length(glb_mdl_ensemble) == 1) &&
               grepl("^%<d-%", glb_mdl_ensemble)) {
        # NOTE(review): evaluates user-supplied text as R code; acceptable only
        #   because glb_mdl_ensemble is an analyst-controlled setting
        glb_mdl_ensemble <- eval(parse(text =
            str_trim(unlist(strsplit(glb_mdl_ensemble, "%<d-%"))[2])))
    }

    # Make sure the predictions of every member model are available
    for (mdl_id in glb_mdl_ensemble) {
        if (!(mdl_id %in% names(glb_models_lst))) {
            warning("Model ", mdl_id, " in glb_mdl_ensemble not found !")
            next
        }
        glbObsFit <- glb_get_predictions(df = glbObsFit, mdl_id, glb_rsp_var)
        glbObsOOB <- glb_get_predictions(df = glbObsOOB, mdl_id, glb_rsp_var)
    }

    #mdl_id_pfx <- "Ensemble.RFE"; mdlId <- paste0(mdl_id_pfx, ".glmnet")
    #glb_mdl_ensemble <- gsub(mygetPredictIds$value, "", grep("RFE\\.X\\.(?!Interact)", row.names(glb_featsimp_df), perl = TRUE, value = TRUE), fixed = TRUE)
    #varImp(glb_models_lst[[mdlId]])
    #cor_df <- data.frame(cor=cor(glbObsFit[, glb_rsp_var], glbObsFit[, paste(mygetPredictIds$value, glb_mdl_ensemble)], use="pairwise.complete.obs"))
    #glbObsFit <- glb_get_predictions(df=glbObsFit, "Ensemble.glmnet", glb_rsp_var);print(colSums((ctgry_df <- myget_category_stats(obs_df=glbObsFit, mdl_id="Ensemble.glmnet", label="fit"))[, -grep(glbFeatsCategory, names(ctgry_df))]))
    ### bid0_sp
    # Better than MFO; models.n=28; min.RMSE.fit=0.0521233; err.abs.fit.sum=7.3631895
    # old: Top x from auto; models.n= 5; min.RMSE.fit=0.06311047; err.abs.fit.sum=9.5937080
    # RFE only ; models.n=16; min.RMSE.fit=0.05148588; err.abs.fit.sum=7.2875091
    # RFE subset only ;models.n= 5; min.RMSE.fit=0.06040702; err.abs.fit.sum=9.059088
    # RFE subset only ;models.n= 9; min.RMSE.fit=0.05933167; err.abs.fit.sum=8.7421288
    # RFE subset only ;models.n=15; min.RMSE.fit=0.0584607; err.abs.fit.sum=8.5902066
    # RFE subset only ;models.n=17; min.RMSE.fit=0.05496899; err.abs.fit.sum=8.0170431
    # RFE subset only ;models.n=18; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
    # RFE subset only ;models.n=16; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
    ### bid0_sp
    ### bid1_sp
    # "auto"; err.abs.fit.sum=76.699774; min.RMSE.fit=0.2186429
    # "RFE.X.*"; err.abs.fit.sum=; min.RMSE.fit=0.221114
    ### bid1_sp

    # Predictors of the ensemble = prediction columns of the member models
    #   (their ".prob" columns for classification)
    indepVar <- paste0(mygetPredictIds(glb_rsp_var)$value, glb_mdl_ensemble)
    if (glb_is_classification)
        indepVar <- paste0(indepVar, ".prob")
    # Some models in glb_mdl_ensemble might not be fitted e.g. RFE.X.Interact
    indepVar <- intersect(indepVar, names(glbObsFit))

    # indepVar <- grep(mygetPredictIds(glb_rsp_var)$value, names(glbObsFit), fixed=TRUE, value=TRUE)
    # if (glb_is_regression)
    # indepVar <- indepVar[!grepl("(err\\.abs|accurate)$", indepVar)]
    # if (glb_is_classification && glb_is_binomial)
    # indepVar <- grep("prob$", indepVar, value=TRUE) else
    # indepVar <- indepVar[!grepl("err$", indepVar)]
    #rfe_fit_ens_results <- myrun_rfe(glbObsFit, indepVar)

    for (method in c("glm", "glmnet")) {
        for (trainControlMethod in
             c("boot", "boot632", "cv", "repeatedcv"
               #, "LOOCV" # tuneLength * nrow(fitDF)
               , "LGOCV", "adaptive_cv"
               #, "adaptive_boot" #error: adaptive$min should be less than 3
               #, "adaptive_LGOCV" #error: adaptive$min should be less than 3
             )) {
            #sav_models_df <- glb_models_df; all.equal(sav_models_df, glb_models_df)
            #glb_models_df <- sav_models_df; print(glb_models_df$id)
            if ((method == "glm") && (trainControlMethod != "repeatedcv"))
                # glm used only to identify outliers
                next

            ret_lst <- myfit_mdl(
                mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
                    id.prefix = paste0(mdl_id_pfx, ".", trainControlMethod),
                    type = glb_model_type, tune.df = NULL,
                    trainControl.method = trainControlMethod,
                    trainControl.number = glb_rcv_n_folds,
                    trainControl.repeats = glb_rcv_n_repeats,
                    trainControl.classProbs = glb_is_classification,
                    trainControl.summaryFunction = glbMdlMetricSummaryFn,
                    train.metric = glbMdlMetricSummary,
                    train.maximize = glbMdlMetricMaximize,
                    train.method = method)),
                indepVar = indepVar, rsp_var = glb_rsp_var,
                fit_df = glbObsFit, OOB_df = glbObsOOB)
        }
    }
    dsp_models_df <- get_dsp_models_df()
}
# Honour a user-specified model id; otherwise fall back to the first row of
# the displayed models data frame.
if (!is.null(glb_sel_mdl_id)) {
    print(sprintf("User specified selection: %s", glb_sel_mdl_id))
} else {
    glb_sel_mdl_id <- dsp_models_df[1, "id"]
}
## [1] "User specified selection: All.X##rcv#glmnet"
myprint_mdl(glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]])
## Length Class Mode
## a0 59 -none- numeric
## beta 4189 dgCMatrix S4
## df 59 -none- numeric
## dim 2 -none- numeric
## lambda 59 -none- numeric
## dev.ratio 59 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 71 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## 3.457069e+01
## .pos
## 9.288549e-07
## .pos.y
## 9.483449e-07
## .rnorm
## -3.174615e-03
## Date.date.fctr(13,19]
## -1.357861e-02
## Date.day.minutes.poly.1
## 2.834088e+00
## Date.day.minutes.poly.2
## -2.985070e+01
## Date.day.minutes.poly.3
## -8.434349e+00
## Date.day.minutes.poly.5
## 6.293047e+00
## Date.last2.log1p
## 3.223433e-03
## Date.last32.log1p
## 6.644769e-02
## Date.minute.fctr(14.8,29.5]
## 2.985276e-01
## Date.minute.fctr(29.5,44.2]
## 5.300126e-02
## Date.minute.fctr(44.2,59.1]
## 3.123613e-01
## Date.month.fctr02
## 3.800956e-02
## Date.month.fctr05
## -3.011597e-02
## Date.month.fctr12
## 3.049606e-02
## Date.wkday.fctr4
## -9.609387e-03
## Date.wkend
## 3.747579e-02
## Date.year.fctr2002
## 5.772629e-02
## Date.year.fctr2003
## 6.357462e-02
## Date.year.fctr2004
## 5.326317e-02
## Date.year.fctr2005
## 3.570502e-02
## Date.year.fctr2007
## 6.819216e-02
## Date.year.fctr2010
## -1.952413e-01
## Date.year.fctr2011
## -2.339641e-01
## Date.year.fctr2012
## -2.015466e-01
## District.fctr10-19
## 1.800929e-02
## ID
## -3.441064e-08
## LocationDescription.my.fctrALLEY
## 1.611979e-01
## LocationDescription.my.fctrCommercialVehicle
## 7.837998e-01
## LocationDescription.my.fctrEntertainment
## 2.697141e-01
## LocationDescription.my.fctrGAS STATION
## 9.136771e-01
## LocationDescription.my.fctrGovernment
## 5.429015e-01
## LocationDescription.my.fctrOther
## 2.440159e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.004673e-01
## LocationDescription.my.fctrResidence
## 1.669185e-01
## LocationDescription.my.fctrSchool
## 4.610375e-01
## Year
## -1.887036e-02
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## 3.420387e+01
## .pos
## 9.190503e-07
## .pos.y
## 9.397726e-07
## .rnorm
## -4.415502e-03
## Date.date.fctr(13,19]
## -1.684284e-02
## Date.day.minutes.poly.1
## 3.635232e+00
## Date.day.minutes.poly.2
## -3.041163e+01
## Date.day.minutes.poly.3
## -9.343094e+00
## Date.day.minutes.poly.5
## 6.818306e+00
## Date.last16.log1p
## 1.566097e-04
## Date.last2.log1p
## 3.519131e-03
## Date.last32.log1p
## 6.809740e-02
## Date.minute.fctr(14.8,29.5]
## 3.059729e-01
## Date.minute.fctr(29.5,44.2]
## 5.756666e-02
## Date.minute.fctr(44.2,59.1]
## 3.197748e-01
## Date.month.fctr02
## 4.257876e-02
## Date.month.fctr05
## -3.417938e-02
## Date.month.fctr12
## 3.517850e-02
## Date.wkday.fctr4
## -1.248810e-02
## Date.wkend
## 4.013331e-02
## Date.year.fctr2002
## 6.405933e-02
## Date.year.fctr2003
## 6.875965e-02
## Date.year.fctr2004
## 5.821583e-02
## Date.year.fctr2005
## 4.065109e-02
## Date.year.fctr2007
## 7.351063e-02
## Date.year.fctr2010
## -2.042294e-01
## Date.year.fctr2011
## -2.444259e-01
## Date.year.fctr2012
## -2.127867e-01
## District.fctr10-19
## 2.173914e-02
## ID
## -3.475309e-08
## LocationDescription.my.fctrALLEY
## 1.729817e-01
## LocationDescription.my.fctrCommercialVehicle
## 8.020796e-01
## LocationDescription.my.fctrEntertainment
## 2.912019e-01
## LocationDescription.my.fctrGAS STATION
## 9.247693e-01
## LocationDescription.my.fctrGovernment
## 5.662629e-01
## LocationDescription.my.fctrOther
## 2.529958e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.)
## 2.063057e-01
## LocationDescription.my.fctrResidence
## 1.784438e-01
## LocationDescription.my.fctrSchool
## 4.825700e-01
## Year
## -1.869923e-02
## [1] TRUE
# From here to save(), this should all be in one function
# these are executed in the same seq twice more:
# fit.data.training & predict.data.new chunks
# Add the selected model's prediction columns to the Fit observations
print(sprintf("%s fit prediction diagnostics:", glb_sel_mdl_id))
## [1] "All.X##rcv#glmnet fit prediction diagnostics:"
glbObsFit <- glb_get_predictions(df = glbObsFit, mdl_id = glb_sel_mdl_id,
rsp_var = glb_rsp_var)
# ... and to the OOB observations
print(sprintf("%s OOB prediction diagnostics:", glb_sel_mdl_id))
## [1] "All.X##rcv#glmnet OOB prediction diagnostics:"
glbObsOOB <- glb_get_predictions(df = glbObsOOB, mdl_id = glb_sel_mdl_id,
rsp_var = glb_rsp_var)
# Start a fresh feature-importance table (featsimp_df = NULL) from the selected
# model; the global glb_featsimp_df is read later by glb_analytics_diag_plots()
print(glb_featsimp_df <- myget_feats_importance(mdl = glb_sel_mdl, featsimp_df = NULL))
## All.X..rcv.glmnet.imp
## Date.day.minutes.poly.2 1.000000e+02
## Date.day.minutes.poly.3 3.025283e+01
## Date.day.minutes.poly.5 2.216548e+01
## Date.day.minutes.poly.1 1.148563e+01
## LocationDescription.my.fctrGAS STATION 3.044642e+00
## LocationDescription.my.fctrCommercialVehicle 2.635189e+00
## LocationDescription.my.fctrGovernment 1.853763e+00
## LocationDescription.my.fctrSchool 1.578745e+00
## Date.minute.fctr(44.2,59.1] 1.050523e+00
## Date.minute.fctr(14.8,29.5] 1.004957e+00
## LocationDescription.my.fctrEntertainment 9.472644e-01
## LocationDescription.my.fctrOther 8.291560e-01
## Date.year.fctr2011 7.999311e-01
## Date.year.fctr2012 6.950268e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.) 6.770819e-01
## Date.year.fctr2010 6.682230e-01
## LocationDescription.my.fctrResidence 5.815146e-01
## LocationDescription.my.fctrALLEY 5.633250e-01
## Date.year.fctr2007 2.391936e-01
## Date.last32.log1p 2.236680e-01
## Date.year.fctr2003 2.236006e-01
## Date.year.fctr2002 2.073581e-01
## Date.year.fctr2004 1.889544e-01
## Date.minute.fctr(29.5,44.2] 1.870589e-01
## Date.month.fctr02 1.375969e-01
## Date.year.fctr2005 1.309955e-01
## Date.wkend 1.307452e-01
## Date.month.fctr12 1.131041e-01
## Date.month.fctr05 1.102015e-01
## District.fctr10-19 6.936166e-02
## Year 6.181592e-02
## Date.date.fctr(13,19] 5.350073e-02
## Date.wkday.fctr4 3.937589e-02
## .rnorm 1.378027e-02
## Date.last2.log1p 1.142459e-02
## Date.last16.log1p 4.170085e-04
## .pos.y 3.106683e-06
## .pos 3.039085e-06
## ID 1.144659e-07
## Beat 0.000000e+00
## Date.date.fctr(7,13] 0.000000e+00
## Date.date.fctr(19,25] 0.000000e+00
## Date.date.fctr(25,31] 0.000000e+00
## Date.day.minutes.poly.4 0.000000e+00
## Date.hour.fctr(7.67,15.3] 0.000000e+00
## Date.hour.fctr(15.3,23] 0.000000e+00
## Date.juliandate 0.000000e+00
## Date.last4.log1p 0.000000e+00
## Date.last8.log1p 0.000000e+00
## Date.month.fctr03 0.000000e+00
## Date.month.fctr04 0.000000e+00
## Date.month.fctr06 0.000000e+00
## Date.month.fctr07 0.000000e+00
## Date.month.fctr08 0.000000e+00
## Date.month.fctr09 0.000000e+00
## Date.month.fctr10 0.000000e+00
## Date.month.fctr11 0.000000e+00
## Date.wkday.fctr1 0.000000e+00
## Date.wkday.fctr2 0.000000e+00
## Date.wkday.fctr3 0.000000e+00
## Date.wkday.fctr5 0.000000e+00
## Date.wkday.fctr6 0.000000e+00
## Date.year.fctr2006 0.000000e+00
## Date.year.fctr2008 0.000000e+00
## Date.year.fctr2009 0.000000e+00
## District.fctr1-9 0.000000e+00
## District.fctr20+ 0.000000e+00
## LocationDescription.my.fctrSidewalk 0.000000e+00
## LocationDescription.my.fctrVACANT LOT/LAND 0.000000e+00
## LocationDescription.my.fctrVEHICLE NON-COMMERCIAL 0.000000e+00
## LocationDescription.my.fctrcha 0.000000e+00
## imp
## Date.day.minutes.poly.2 1.000000e+02
## Date.day.minutes.poly.3 3.025283e+01
## Date.day.minutes.poly.5 2.216548e+01
## Date.day.minutes.poly.1 1.148563e+01
## LocationDescription.my.fctrGAS STATION 3.044642e+00
## LocationDescription.my.fctrCommercialVehicle 2.635189e+00
## LocationDescription.my.fctrGovernment 1.853763e+00
## LocationDescription.my.fctrSchool 1.578745e+00
## Date.minute.fctr(44.2,59.1] 1.050523e+00
## Date.minute.fctr(14.8,29.5] 1.004957e+00
## LocationDescription.my.fctrEntertainment 9.472644e-01
## LocationDescription.my.fctrOther 8.291560e-01
## Date.year.fctr2011 7.999311e-01
## Date.year.fctr2012 6.950268e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.) 6.770819e-01
## Date.year.fctr2010 6.682230e-01
## LocationDescription.my.fctrResidence 5.815146e-01
## LocationDescription.my.fctrALLEY 5.633250e-01
## Date.year.fctr2007 2.391936e-01
## Date.last32.log1p 2.236680e-01
## Date.year.fctr2003 2.236006e-01
## Date.year.fctr2002 2.073581e-01
## Date.year.fctr2004 1.889544e-01
## Date.minute.fctr(29.5,44.2] 1.870589e-01
## Date.month.fctr02 1.375969e-01
## Date.year.fctr2005 1.309955e-01
## Date.wkend 1.307452e-01
## Date.month.fctr12 1.131041e-01
## Date.month.fctr05 1.102015e-01
## District.fctr10-19 6.936166e-02
## Year 6.181592e-02
## Date.date.fctr(13,19] 5.350073e-02
## Date.wkday.fctr4 3.937589e-02
## .rnorm 1.378027e-02
## Date.last2.log1p 1.142459e-02
## Date.last16.log1p 4.170085e-04
## .pos.y 3.106683e-06
## .pos 3.039085e-06
## ID 1.144659e-07
## Beat 0.000000e+00
## Date.date.fctr(7,13] 0.000000e+00
## Date.date.fctr(19,25] 0.000000e+00
## Date.date.fctr(25,31] 0.000000e+00
## Date.day.minutes.poly.4 0.000000e+00
## Date.hour.fctr(7.67,15.3] 0.000000e+00
## Date.hour.fctr(15.3,23] 0.000000e+00
## Date.juliandate 0.000000e+00
## Date.last4.log1p 0.000000e+00
## Date.last8.log1p 0.000000e+00
## Date.month.fctr03 0.000000e+00
## Date.month.fctr04 0.000000e+00
## Date.month.fctr06 0.000000e+00
## Date.month.fctr07 0.000000e+00
## Date.month.fctr08 0.000000e+00
## Date.month.fctr09 0.000000e+00
## Date.month.fctr10 0.000000e+00
## Date.month.fctr11 0.000000e+00
## Date.wkday.fctr1 0.000000e+00
## Date.wkday.fctr2 0.000000e+00
## Date.wkday.fctr3 0.000000e+00
## Date.wkday.fctr5 0.000000e+00
## Date.wkday.fctr6 0.000000e+00
## Date.year.fctr2006 0.000000e+00
## Date.year.fctr2008 0.000000e+00
## Date.year.fctr2009 0.000000e+00
## District.fctr1-9 0.000000e+00
## District.fctr20+ 0.000000e+00
## LocationDescription.my.fctrSidewalk 0.000000e+00
## LocationDescription.my.fctrVACANT LOT/LAND 0.000000e+00
## LocationDescription.my.fctrVEHICLE NON-COMMERCIAL 0.000000e+00
## LocationDescription.my.fctrcha 0.000000e+00
#mdl_id <-"RFE.X.glmnet"; glb_featsimp_df <- myget_feats_importance(glb_models_lst[[mdl_id]], glb_featsimp_df); glb_featsimp_df[, paste0(mdl_id, ".imp")] <- glb_featsimp_df$imp; print(glb_featsimp_df)
#print(head(sbst_featsimp_df <- subset(glb_featsimp_df, is.na(RFE.X.glmnet.imp) | (abs(RFE.X.YeoJohnson.glmnet.imp - RFE.X.glmnet.imp) > 0.0001), select=-imp)))
#print(orderBy(~ -cor.y.abs, subset(glb_feats_df, id %in% c(row.names(sbst_featsimp_df), "startprice.dcm1.is9", "D.weight.post.stop.sum"))))
# Used again in fit.data.training & predict.data.new chunks
glb_analytics_diag_plots <- function(obs_df, mdl_id, prob_threshold=NULL) {
    # Print diagnostic plots for the predictions of model `mdl_id` in `obs_df`.
    #
    # Args:
    #   obs_df: observations holding the response (glb_rsp_var) and the
    #           prediction columns of `mdl_id`.
    #   mdl_id: id of the model whose predictions are plotted.
    #   prob_threshold: passed through to myplot_prediction_classification();
    #           NULL by default.
    #
    # Reads the globals glb_featsimp_df, glb_rsp_var, glbFeatsId,
    # glb_is_regression and glb_is_classification.
    # Side effects: prints up to 5 response-vs-feature scatter plots plus one
    # prediction plot; warns when no important features are available.
    if (!is.null(featsimp_df <- glb_featsimp_df)) {
        # Row names are model terms: strip backticks, split "a:b" interaction
        #   terms into feat / feat.interact, and collapse factor-level terms
        #   ("x.fctrLEVEL") back to the factor's name ("x.fctr")
        featsimp_df$feat <- gsub("`(.*?)`", "\\1", row.names(featsimp_df))
        featsimp_df$feat.interact <- gsub("(.*?):(.*)", "\\2", featsimp_df$feat)
        featsimp_df$feat <- gsub("(.*?):(.*)", "\\1", featsimp_df$feat)
        featsimp_df$feat.interact <-
            ifelse(featsimp_df$feat.interact == featsimp_df$feat,
                   NA, featsimp_df$feat.interact)
        featsimp_df$feat <-
            gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", featsimp_df$feat)
        featsimp_df$feat.interact <-
            gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", featsimp_df$feat.interact)
        featsimp_df <- orderBy(~ -imp.max,
                               summaryBy(imp ~ feat + feat.interact, data=featsimp_df,
                                         FUN=max))
        #rex_str=":(.*)"; txt_vctr=tail(featsimp_df$feat); ret_lst <- regexec(rex_str, txt_vctr); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
        featsimp_df <- subset(featsimp_df, !is.na(imp.max))

        # A feature with several interaction partners has one row per partner;
        #   keep only its most important row so that it is plotted once and
        #   cannot end up on both axes of the prediction plot
        featsimp_df <- featsimp_df[!duplicated(featsimp_df$feat), , drop = FALSE]

        if (nrow(featsimp_df) > 5) {
            warning("Limiting important feature scatter plots to 5 out of ",
                    nrow(featsimp_df))
            featsimp_df <- head(featsimp_df, 5)
        }

        # if (!all(is.na(featsimp_df$feat.interact)))
        # stop("not implemented yet")
        rsp_var_out <- mygetPredictIds(glb_rsp_var, mdl_id)$value
        # Actual vs. predicted response against each important feature
        for (var in featsimp_df$feat) {
            plot_df <- melt(obs_df, id.vars = var,
                            measure.vars = c(glb_rsp_var, rsp_var_out))
            print(myplot_scatter(plot_df, var, "value", colorcol_name = "variable",
                                 facet_colcol_name = "variable", jitter = TRUE) +
                      guides(color = FALSE))
        }
    }

    if (glb_is_regression) {
        if (is.null(featsimp_df) || (nrow(featsimp_df) == 0))
            warning("No important features in glb_fin_mdl") else
            print(myplot_prediction_regression(df=obs_df,
                feat_x=if (nrow(featsimp_df) > 1) featsimp_df$feat[2] else
                    ".rownames",
                feat_y=featsimp_df$feat[1],
                rsp_var=glb_rsp_var, rsp_var_out=rsp_var_out,
                id_vars=glbFeatsId)
                # + facet_wrap(reformulate(featsimp_df$feat[2])) # if [1 or 2] is a factor
                # + geom_point(aes_string(color="<col_name>.fctr")) # to color the plot
            )
    }

    if (glb_is_classification) {
        if (is.null(featsimp_df) || (nrow(featsimp_df) == 0))
            warning("No features in selected model are statistically important")
        else print(myplot_prediction_classification(df = obs_df,
            feat_x = if (nrow(featsimp_df) > 1)
                featsimp_df$feat[2] else ".rownames",
            feat_y = featsimp_df$feat[1],
            rsp_var = glb_rsp_var,
            rsp_var_out = rsp_var_out,
            id_vars = glbFeatsId,
            prob_threshold = prob_threshold))
    }
}
# Diagnostic plots for the selected model on the OOB observations; only
# binomial classification supplies the model's OOB-optimal probability
# threshold, every other case uses the function's default (NULL).
glb_analytics_diag_plots(
    obs_df = glbObsOOB,
    mdl_id = glb_sel_mdl_id,
    prob_threshold =
        if (glb_is_classification && glb_is_binomial) {
            glb_models_df[glb_models_df$id == glb_sel_mdl_id,
                          "opt.prob.threshold.OOB"]
        } else {
            NULL
        })
## Warning in glb_analytics_diag_plots(obs_df = glbObsOOB, mdl_id =
## glb_sel_mdl_id, : Limiting important feature scatter plots to 5 out of 26
## [1] "Min/Max Boundaries: "
## ID Arrest Arrest.All.X..rcv.glmnet.prob Arrest.All.X..rcv.glmnet
## 1 8812290 T 0.04960291 F
## 2 8834045 T 0.05071953 F
## 3 8864270 T 0.05257967 F
## 4 8949625 T 0.05416327 F
## 5 8868931 T 0.06127878 F
## 6 8944054 T 0.06188272 F
## Arrest.All.X..rcv.glmnet.err Arrest.All.X..rcv.glmnet.err.abs
## 1 TRUE 0.9503971
## 2 TRUE 0.9492805
## 3 TRUE 0.9474203
## 4 TRUE 0.9458367
## 5 TRUE 0.9387212
## 6 TRUE 0.9381173
## Arrest.All.X..rcv.glmnet.is.acc Arrest.All.X..rcv.glmnet.accurate
## 1 FALSE FALSE
## 2 FALSE FALSE
## 3 FALSE FALSE
## 4 FALSE FALSE
## 5 FALSE FALSE
## 6 FALSE FALSE
## Arrest.All.X..rcv.glmnet.error .label
## 1 -0.05039709 8812290
## 2 -0.04928047 8834045
## 3 -0.04742033 8864270
## 4 -0.04583673 8949625
## 5 -0.03872122 8868931
## 6 -0.03811728 8944054
## ID Arrest Arrest.All.X..rcv.glmnet.prob Arrest.All.X..rcv.glmnet
## 1 8812290 T 0.04960291 F
## 5 8868931 T 0.06127878 F
## 7 8652015 T 0.08516781 F
## 10 8637721 F 0.07760630 F
## 12 6764611 F 0.10730069 T
## 18 6171778 F 0.15343180 T
## Arrest.All.X..rcv.glmnet.err Arrest.All.X..rcv.glmnet.err.abs
## 1 TRUE 0.9503971
## 5 TRUE 0.9387212
## 7 TRUE 0.9148322
## 10 FALSE 0.0776063
## 12 TRUE 0.1073007
## 18 TRUE 0.1534318
## Arrest.All.X..rcv.glmnet.is.acc Arrest.All.X..rcv.glmnet.accurate
## 1 FALSE FALSE
## 5 FALSE FALSE
## 7 FALSE FALSE
## 10 TRUE TRUE
## 12 FALSE FALSE
## 18 FALSE FALSE
## Arrest.All.X..rcv.glmnet.error .label
## 1 -0.050397092 8812290
## 5 -0.038721221 8868931
## 7 -0.014832188 8652015
## 10 0.000000000 8637721
## 12 0.007300688 6764611
## 18 0.053431798 6171778
## ID Arrest Arrest.All.X..rcv.glmnet.prob Arrest.All.X..rcv.glmnet
## 16 6553663 F 0.1234584 T
## 17 6762713 F 0.1472955 T
## 18 6171778 F 0.1534318 T
## 19 5627081 F 0.1658500 T
## 20 1814843 F 0.1838076 T
## 21 3732883 F 0.3346066 T
## Arrest.All.X..rcv.glmnet.err Arrest.All.X..rcv.glmnet.err.abs
## 16 TRUE 0.1234584
## 17 TRUE 0.1472955
## 18 TRUE 0.1534318
## 19 TRUE 0.1658500
## 20 TRUE 0.1838076
## 21 TRUE 0.3346066
## Arrest.All.X..rcv.glmnet.is.acc Arrest.All.X..rcv.glmnet.accurate
## 16 FALSE FALSE
## 17 FALSE FALSE
## 18 FALSE FALSE
## 19 FALSE FALSE
## 20 FALSE FALSE
## 21 FALSE FALSE
## Arrest.All.X..rcv.glmnet.error .label
## 16 0.02345841 6553663
## 17 0.04729550 6762713
## 18 0.05343180 6171778
## 19 0.06585005 5627081
## 20 0.08380760 1814843
## 21 0.23460660 3732883
## [1] "Inaccurate: "
## ID Arrest Arrest.All.X..rcv.glmnet.prob Arrest.All.X..rcv.glmnet
## 1 8596516 T 0.04555389 F
## 2 8872919 T 0.04765161 F
## 3 8909546 T 0.04805332 F
## 4 8811336 T 0.04805341 F
## 5 8341397 T 0.04841366 F
## 6 8337628 T 0.04855636 F
## Arrest.All.X..rcv.glmnet.err Arrest.All.X..rcv.glmnet.err.abs
## 1 TRUE 0.9544461
## 2 TRUE 0.9523484
## 3 TRUE 0.9519467
## 4 TRUE 0.9519466
## 5 TRUE 0.9515863
## 6 TRUE 0.9514436
## Arrest.All.X..rcv.glmnet.is.acc Arrest.All.X..rcv.glmnet.accurate
## 1 FALSE FALSE
## 2 FALSE FALSE
## 3 FALSE FALSE
## 4 FALSE FALSE
## 5 FALSE FALSE
## 6 FALSE FALSE
## Arrest.All.X..rcv.glmnet.error
## 1 -0.05444611
## 2 -0.05234839
## 3 -0.05194668
## 4 -0.05194659
## 5 -0.05158634
## 6 -0.05144364
## ID Arrest Arrest.All.X..rcv.glmnet.prob
## 2186 7423197 T 0.0861755
## 7983 5615259 F 0.1023279
## 29173 7119058 F 0.1142858
## 62519 1553446 F 0.1318627
## 69552 4475467 F 0.1358597
## 73433 2248530 F 0.1382141
## Arrest.All.X..rcv.glmnet Arrest.All.X..rcv.glmnet.err
## 2186 F TRUE
## 7983 T TRUE
## 29173 T TRUE
## 62519 T TRUE
## 69552 T TRUE
## 73433 T TRUE
## Arrest.All.X..rcv.glmnet.err.abs Arrest.All.X..rcv.glmnet.is.acc
## 2186 0.9138245 FALSE
## 7983 0.1023279 FALSE
## 29173 0.1142858 FALSE
## 62519 0.1318627 FALSE
## 69552 0.1358597 FALSE
## 73433 0.1382141 FALSE
## Arrest.All.X..rcv.glmnet.accurate Arrest.All.X..rcv.glmnet.error
## 2186 FALSE -0.013824498
## 7983 FALSE 0.002327929
## 29173 FALSE 0.014285754
## 62519 FALSE 0.031862707
## 69552 FALSE 0.035859689
## 73433 FALSE 0.038214150
## ID Arrest Arrest.All.X..rcv.glmnet.prob
## 105960 1684852 F 0.3820876
## 105961 1343195 F 0.3831750
## 105962 2046674 F 0.3840201
## 105963 1320838 F 0.3864216
## 105964 1394439 F 0.3877208
## 105965 2004388 F 0.3886837
## Arrest.All.X..rcv.glmnet Arrest.All.X..rcv.glmnet.err
## 105960 T TRUE
## 105961 T TRUE
## 105962 T TRUE
## 105963 T TRUE
## 105964 T TRUE
## 105965 T TRUE
## Arrest.All.X..rcv.glmnet.err.abs Arrest.All.X..rcv.glmnet.is.acc
## 105960 0.3820876 FALSE
## 105961 0.3831750 FALSE
## 105962 0.3840201 FALSE
## 105963 0.3864216 FALSE
## 105964 0.3877208 FALSE
## 105965 0.3886837 FALSE
## Arrest.All.X..rcv.glmnet.accurate Arrest.All.X..rcv.glmnet.error
## 105960 FALSE 0.2820876
## 105961 FALSE 0.2831750
## 105962 FALSE 0.2840201
## 105963 FALSE 0.2864216
## 105964 FALSE 0.2877208
## 105965 FALSE 0.2886837
# Attach the selected model's per-category error stats (Fit and OOB) to
# glbLvlCategory, then print the table ordered by mean error and its column
# totals.
if (!is.null(glbFeatsCategory)) {
    glbLvlCategory <- merge(glbLvlCategory,
        myget_category_stats(obs_df = glbObsFit, mdl_id = glb_sel_mdl_id,
                             label = "fit"),
        by = glbFeatsCategory, all = TRUE)
    row.names(glbLvlCategory) <- glbLvlCategory[, glbFeatsCategory]
    # No `by` here: merge() joins on every shared column name, so any count
    #   column present on both sides (the original note suggests .n.OOB) is
    #   matched on rather than duplicated
    glbLvlCategory <- merge(glbLvlCategory,
        myget_category_stats(obs_df = glbObsOOB, mdl_id = glb_sel_mdl_id,
                             label="OOB"),
        #by=glbFeatsCategory, all=TRUE) glb_ctgry-df already contains .n.OOB ?
        all = TRUE)
    row.names(glbLvlCategory) <- glbLvlCategory[, glbFeatsCategory]
    if (any(grepl("OOB", glbMdlMetricsEval)))
        print(orderBy(~-err.abs.OOB.mean, glbLvlCategory)) else
        print(orderBy(~-err.abs.fit.mean, glbLvlCategory))
    # Drop the category column(s) by exact name: grep() would treat the name as
    #   a regex ("." matches any character, partial matches count), and
    #   drop = FALSE keeps a data frame even when a single column remains
    print(colSums(glbLvlCategory[, !(names(glbLvlCategory) %in% glbFeatsCategory),
                                 drop = FALSE]))
}
## LocationDescription.my .n.OOB
## GAS STATION GAS STATION 2111
## CommercialVehicle CommercialVehicle 648
## School School 415
## Government Government 320
## Other Other 5206
## Entertainment Entertainment 651
## PARKING LOT/GARAGE(NON.RESID.) PARKING LOT/GARAGE(NON.RESID.) 14852
## ALLEY ALLEY 2307
## Residence Residence 5891
## Sidewalk Sidewalk 463
## cha cha 410
## STREET STREET 156553
## VACANT LOT/LAND VACANT LOT/LAND 985
## VEHICLE NON-COMMERCIAL VEHICLE NON-COMMERCIAL 817
## .n.Fit .n.Tst .freqRatio.Fit .freqRatio.OOB
## GAS STATION 2111 2111 0.011016078 0.011016078
## CommercialVehicle 648 648 0.003381534 0.003381534
## School 415 415 0.002165643 0.002165643
## Government 320 320 0.001669893 0.001669893
## Other 5206 5206 0.027167078 0.027167078
## Entertainment 651 651 0.003397189 0.003397189
## PARKING LOT/GARAGE(NON.RESID.) 14852 14852 0.077503927 0.077503927
## ALLEY 2307 2307 0.012038888 0.012038888
## Residence 5891 5891 0.030741694 0.030741694
## Sidewalk 463 463 0.002416127 0.002416127
## cha 410 410 0.002139551 0.002139551
## STREET 156553 156553 0.816958811 0.816958811
## VACANT LOT/LAND 985 985 0.005140141 0.005140141
## VEHICLE NON-COMMERCIAL 817 817 0.004263447 0.004263447
## .freqRatio.Tst err.abs.fit.sum
## GAS STATION 0.011016078 663.07463
## CommercialVehicle 0.003381534 188.66921
## School 0.002165643 106.88868
## Government 0.001669893 82.61155
## Other 0.027167078 1004.74084
## Entertainment 0.003397189 126.31063
## PARKING LOT/GARAGE(NON.RESID.) 0.077503927 2776.35723
## ALLEY 0.012038888 420.97635
## Residence 0.030741694 920.76198
## Sidewalk 0.002416127 71.96968
## cha 0.002139551 60.51845
## STREET 0.816958811 21514.75804
## VACANT LOT/LAND 0.005140141 133.15761
## VEHICLE NON-COMMERCIAL 0.004263447 99.27551
## err.abs.fit.mean .n.fit err.abs.OOB.sum
## GAS STATION 0.3141045 2111 732.87713
## CommercialVehicle 0.2911562 648 208.64035
## School 0.2575631 415 119.85835
## Government 0.2581611 320 92.19587
## Other 0.1929967 5206 1148.76009
## Entertainment 0.1940255 651 142.96743
## PARKING LOT/GARAGE(NON.RESID.) 0.1869349 14852 3188.55838
## ALLEY 0.1824778 2307 481.94756
## Residence 0.1562998 5891 1060.26102
## Sidewalk 0.1554421 463 82.59952
## cha 0.1476060 410 71.81180
## STREET 0.1374280 156553 25114.30026
## VACANT LOT/LAND 0.1351854 985 157.09572
## VEHICLE NON-COMMERCIAL 0.1215122 817 114.18044
## err.abs.OOB.mean
## GAS STATION 0.3471706
## CommercialVehicle 0.3219758
## School 0.2888153
## Government 0.2881121
## Other 0.2206608
## Entertainment 0.2196120
## PARKING LOT/GARAGE(NON.RESID.) 0.2146888
## ALLEY 0.2089066
## Residence 0.1799798
## Sidewalk 0.1784007
## cha 0.1751507
## STREET 0.1604204
## VACANT LOT/LAND 0.1594880
## VEHICLE NON-COMMERCIAL 0.1397557
## .n.OOB .n.Fit .n.Tst .freqRatio.Fit
## 1.916290e+05 1.916290e+05 1.916290e+05 1.000000e+00
## .freqRatio.OOB .freqRatio.Tst err.abs.fit.sum err.abs.fit.mean
## 1.000000e+00 1.000000e+00 2.817007e+04 2.730893e+00
## .n.fit err.abs.OOB.sum err.abs.OOB.mean
## 1.916290e+05 3.271605e+04 3.103138e+00
# Export the id column(s) plus every column whose name contains the response
# variable name for the OOB observations. The file name is the output prefix
# followed by the selected model id, with dots turned into underscores.
write.csv(
    x = glbObsOOB[, c(glbFeatsId,
                      names(glbObsOOB)[grepl(glb_rsp_var, names(glbObsOOB),
                                             fixed = TRUE)])],
    file = paste0(chartr(".", "_", paste0(glbOut$pfx, glb_sel_mdl_id)),
                  "_OOBobs.csv"),
    row.names = FALSE)
fit.models_2_chunk_df <-
myadd_chunk(NULL, "fit.models_2_bgn", label.minor = "teardown")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_2_bgn 1 0 teardown 5530.083 NA NA
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 18 fit.models 8 2 2 5330.965 5530.207 199.242
## 19 fit.models 8 3 3 5530.207 NA NA
# if (sum(is.na(glbObsAll$D.P.http)) > 0)
# stop("fit.models_3: Why is this happening ?")
#stop(here"); glb2Sav()
sync_glb_obs_df <- function() {
    # Copy columns that exist only in glbObsFit / glbObsOOB (e.g. prediction
    # columns) into the matching rows of the globals glbObsTrn and glbObsAll,
    # selected through their .lcn column ("Fit" / "OOB").
    #
    # Takes no arguments and is called for its side effect: glbObsTrn and
    # glbObsAll are modified in the enclosing (global) environment via `<<-`.
    # The OOB rows of glbObsTrn are synced only when glbObsNew carries no
    # response values.
    # NOTE(review): assumes the number of "Fit" / "OOB" rows in the targets
    #   equals nrow(glbObsFit) / nrow(glbObsOOB) -- confirm with callers.

    # Merge or cbind ?
    # Work out the missing columns BEFORE anything is copied: the Fit loops add
    #   those columns to the targets, so a set-difference taken afterwards
    #   would skip every column shared by Fit and OOB and leave the OOB rows NA
    fit_cols_trn <- setdiff(names(glbObsFit), names(glbObsTrn))
    fit_cols_all <- setdiff(names(glbObsFit), names(glbObsAll))
    oob_cols_trn <- setdiff(names(glbObsOOB), names(glbObsTrn))
    oob_cols_all <- setdiff(names(glbObsOOB), names(glbObsAll))

    for (col in fit_cols_trn)
        glbObsTrn[glbObsTrn$.lcn == "Fit", col] <<- glbObsFit[, col]
    for (col in fit_cols_all)
        glbObsAll[glbObsAll$.lcn == "Fit", col] <<- glbObsFit[, col]

    if (all(is.na(glbObsNew[, glb_rsp_var]))) {
        for (col in oob_cols_trn)
            glbObsTrn[glbObsTrn$.lcn == "OOB", col] <<- glbObsOOB[, col]
    }
    for (col in oob_cols_all)
        glbObsAll[glbObsAll$.lcn == "OOB", col] <<- glbObsOOB[, col]
}
sync_glb_obs_df()
print(setdiff(names(glbObsNew), names(glbObsAll)))
## character(0)
replay.petrisim(pn = glb_analytics_pn,
replay.trans = (glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"model.selected")), flip_coord = TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc = TRUE)
## label step_major step_minor label_minor bgn end
## 19 fit.models 8 3 3 5530.207 5635.828
## 20 fit.data.training 9 0 0 5635.829 NA
## elapsed
## 19 105.621
## 20 NA
9.0: fit data training
#load(paste0(glb_inp_pfx, "dsk.RData"))
# Choose or fit the final model, leaving it in glb_fin_mdl / glb_fin_mdl_id.
#   * A user-specified glb_fin_mdl_id that is already present in glb_models_lst
#     is reused as is.
#   * Otherwise, when glbObsNew has at least one non-NA value in glb_rsp_var,
#     the selected model is registered again under a "Final."-prefixed id
#     (no refit).
#   * Otherwise the selected model's method (plus any methods listed in
#     glbMdlFamilies[["Final"]]) is refit on glbObsTrn, minus any rows listed
#     in glbObsTrnOutliers.
if (!is.null(glb_fin_mdl_id) && (glb_fin_mdl_id %in% names(glb_models_lst))) {
  warning("Final model same as user selected model")
  glb_fin_mdl <- glb_models_lst[[glb_fin_mdl_id]]
} else if (!all(is.na(glbObsNew[, glb_rsp_var]))) {
  # if (nrow(glbObsFit) + length(glbObsFitOutliers) == nrow(glbObsTrn))
  warning("Final model same as glb_sel_mdl_id")
  glb_fin_mdl_id <- paste0("Final.", glb_sel_mdl_id)
  glb_fin_mdl <- glb_sel_mdl
  glb_models_lst[[glb_fin_mdl_id]] <- glb_fin_mdl
  # Duplicate the selected model's summary row under the new id
  mdlDf <- glb_models_df[glb_models_df$id == glb_sel_mdl_id, ]
  mdlDf$id <- glb_fin_mdl_id
  glb_models_df <- rbind(glb_models_df, mdlDf)
} else {
  # names(glbMdlFamilies) is a vector; `if` needs exactly one TRUE/FALSE, so
  # collapse the per-name matches with any() (also covers the zero-length case)
  if (any(grepl("RFE\\.X", names(glbMdlFamilies)))) {
    indepVar <- mygetIndepVar(glb_feats_df)
    rfe_trn_results <-
      myrun_rfe(glbObsTrn, indepVar, glbRFESizes[["Final"]])
    # Report predictors that differ between the Train-based and Fit-based RFE
    if (!isTRUE(all.equal(sort(predictors(rfe_trn_results)),
                          sort(predictors(rfe_fit_results))))) {
      print("Diffs predictors(rfe_trn_results) vs. predictors(rfe_fit_results):")
      print(setdiff(predictors(rfe_trn_results), predictors(rfe_fit_results)))
      print("Diffs predictors(rfe_fit_results) vs. predictors(rfe_trn_results):")
      print(setdiff(predictors(rfe_fit_results), predictors(rfe_trn_results)))
    }
  }

  if (grepl("Ensemble", glb_sel_mdl_id)) {
    # Find which models are relevant
    mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), imp > 5)
    # Fit selected models on glbObsTrn
    for (mdl_id in gsub(".prob", "",
                        gsub(mygetPredictIds(glb_rsp_var)$value, "",
                             row.names(mdlimp_df), fixed = TRUE),
                        fixed = TRUE)) {
      # The last "."-separated token of the id is used as the caret method;
      # the remaining tokens plus "Train" form the id prefix of the refit model
      mdl_id_components <- unlist(strsplit(mdl_id, "[.]"))
      mdlIdPfx <- paste0(c(head(mdl_id_components, -1), "Train"),
                         collapse = ".")
      if (grepl("RFE\\.X\\.", mdlIdPfx)) {
        mdlIndepVars <- myadjustInteractionFeats(glb_feats_df,
          myextract_actual_feats(predictors(rfe_trn_results)))
      } else {
        mdlIndepVars <- trim(unlist(
          strsplit(glb_models_df[glb_models_df$id == mdl_id, "feats"], "[,]")))
      }
      ret_lst <-
        myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = mdlIdPfx,
            type = glb_model_type, tune.df = glbMdlTuneParams,
            trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = tail(mdl_id_components, 1))),
          indepVar = mdlIndepVars,
          rsp_var = glb_rsp_var,
          fit_df = glbObsTrn, OOB_df = NULL)
      # Predict with the model just added (last row of glb_models_df), using
      # the OOB-optimal probability threshold of the original component model
      glbObsTrn <- glb_get_predictions(df = glbObsTrn,
        mdl_id = tail(glb_models_df$id, 1),
        rsp_var = glb_rsp_var,
        prob_threshold_def =
          subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
      glbObsNew <- glb_get_predictions(df = glbObsNew,
        mdl_id = tail(glb_models_df$id, 1),
        rsp_var = glb_rsp_var,
        prob_threshold_def =
          subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
    }
  }

  # "Final" model
  if ((model_method <- glb_sel_mdl$method) == "custom") {
    # get actual method from the mdl_id
    model_method <- tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)
  }

  # Independent variables for the final fit
  if (grepl("Ensemble", glb_sel_mdl_id)) {
    # Find which models are relevant
    mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), imp > 5)
    # Point the ensemble at the ".Train." versions of its component predictions
    if (glb_is_classification && glb_is_binomial) {
      indepVar <- gsub("(.*)\\.(.*)\\.prob", "\\1\\.Train\\.\\2\\.prob",
                       row.names(mdlimp_df))
    } else {
      indepVar <- gsub("(.*)\\.(.*)", "\\1\\.Train\\.\\2",
                       row.names(mdlimp_df))
    }
  } else if (grepl("RFE.X", glb_sel_mdl_id, fixed = TRUE)) {
    indepVar <- myextract_actual_feats(predictors(rfe_trn_results))
  } else {
    indepVar <-
      trim(unlist(strsplit(glb_models_df[glb_models_df$id == glb_sel_mdl_id,
                                         "feats"], "[,]")))
  }

  # Recover the preProcess method from the selected model id: the first match
  # of any glb_preproc_methods entry inside glb_sel_mdl_id, if there is one
  if (!is.null(glb_preproc_methods) &&
      ((match_pos <- regexpr(gsub(".", "\\.",
                                  paste(glb_preproc_methods, collapse = "|"),
                                  fixed = TRUE), glb_sel_mdl_id)) != -1)) {
    ths_preProcess <- str_sub(glb_sel_mdl_id, match_pos,
                              match_pos + attr(match_pos, "match.length") - 1)
  } else {
    ths_preProcess <- NULL
  }

  # Scalar condition, so plain if/else rather than vectorised ifelse()
  mdl_id_pfx <- if (grepl("Ensemble", glb_sel_mdl_id)) {
    "Final.Ensemble"
  } else {
    "Final"
  }

  # Drop training rows whose id is listed as an outlier for this model prefix
  trnobs_df <- glbObsTrn
  if (!is.null(glbObsTrnOutliers[[mdl_id_pfx]])) {
    trnobs_df <- glbObsTrn[!(glbObsTrn[, glbFeatsId] %in%
                               glbObsTrnOutliers[[mdl_id_pfx]]), ]
    print(sprintf("Outliers removed: %d", nrow(glbObsTrn) - nrow(trnobs_df)))
    print(setdiff(glbObsTrn[, glbFeatsId], trnobs_df[, glbFeatsId]))
  }

  # Force fitting of Final.glm to identify outliers
  method_vctr <- unique(c(myparseMdlId(glb_sel_mdl_id)$alg,
                          glbMdlFamilies[["Final"]]))
  for (method in method_vctr) {
    #source("caret_nominalTrainWorkflow.R")
    # glmnet requires at least 2 indep vars
    if ((length(indepVar) == 1) && (method %in% "glmnet")) {
      next
    }
    ret_lst <-
      myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
          id.prefix = mdl_id_pfx,
          type = glb_model_type, trainControl.method = "repeatedcv",
          trainControl.number = glb_rcv_n_folds,
          trainControl.repeats = glb_rcv_n_repeats,
          trainControl.classProbs = glb_is_classification,
          trainControl.summaryFunction = glbMdlMetricSummaryFn,
          trainControl.allowParallel = glbMdlAllowParallel,
          train.metric = glbMdlMetricSummary,
          train.maximize = glbMdlMetricMaximize,
          train.method = method,
          train.preProcess = ths_preProcess)),
        indepVar = indepVar, rsp_var = glb_rsp_var,
        fit_df = trnobs_df, OOB_df = NULL)
    # The most recently added model becomes the final model, unless it is a
    # "glm" fit alongside other methods
    if ((length(method_vctr) == 1) || (method != "glm")) {
      glb_fin_mdl <- glb_models_lst[[length(glb_models_lst)]]
      glb_fin_mdl_id <- glb_models_df[length(glb_models_lst), "id"]
    }
  }
}
## Warning: Final model same as glb_sel_mdl_id
# ret_lst is only created when a model is actually (re)fitted in the chunk
# above; when an existing model is reused it never exists, so guard the cleanup
# instead of triggering an "object 'ret_lst' not found" warning.
if (exists("ret_lst")) rm(ret_lst)
# Log the next "fit.data.training" entry in the chunk table without starting a
# new major step
glb_chunks_df <- myadd_chunk(
  glb_chunks_df,
  "fit.data.training",
  major.inc = FALSE
)
## label step_major step_minor label_minor bgn end
## 20 fit.data.training 9 0 0 5635.829 5636.297
## 21 fit.data.training 9 1 1 5636.298 NA
## elapsed
## 20 0.468
## 21 NA
#stop(here"); glb2Sav()
# Probability cut-off used for the final predictions: the selected model's
# OOB-optimal threshold for binomial classification, NULL otherwise
prob_threshold <- if (glb_is_classification && glb_is_binomial) {
  glb_models_df[glb_models_df$id == glb_sel_mdl_id, "opt.prob.threshold.OOB"]
} else {
  NULL
}
if (grepl("Ensemble", glb_fin_mdl_id)) {
  # Get predictions for each model in ensemble; Outliers that have been moved to OOB might not have been predicted yet
  # feats is a comma-separated string; trim each piece (as the other readers of
  # the feats column in this file do) so that padding around the separators
  # cannot leak into the component ids
  mdlEnsembleComps <- trim(unlist(str_split(
    subset(glb_models_df, id == glb_fin_mdl_id)$feats, ",")))
  # Reduce each prediction column name to its model id: drop the ".prob"
  # suffix (binomial only) and the leading response-variable prefix
  if (glb_is_classification && glb_is_binomial) {
    mdlEnsembleComps <- gsub("\\.prob$", "", mdlEnsembleComps)
  }
  mdlEnsembleComps <- gsub(
    paste0("^",
           gsub(".", "\\.", mygetPredictIds(glb_rsp_var)$value, fixed = TRUE)),
    "", mdlEnsembleComps)
  for (mdl_id in mdlEnsembleComps) {
    glbObsTrn <- glb_get_predictions(df = glbObsTrn, mdl_id = mdl_id,
                                     rsp_var = glb_rsp_var,
                                     prob_threshold_def = prob_threshold)
    glbObsNew <- glb_get_predictions(df = glbObsNew, mdl_id = mdl_id,
                                     rsp_var = glb_rsp_var,
                                     prob_threshold_def = prob_threshold)
  }
}
# Add the final model's predictions to the training observations
glbObsTrn <- glb_get_predictions(
  df = glbObsTrn,
  mdl_id = glb_fin_mdl_id,
  rsp_var = glb_rsp_var,
  prob_threshold_def = prob_threshold
)
# Refresh the feature-importance table from the final model and display it
glb_featsimp_df <- myget_feats_importance(
  mdl = glb_fin_mdl,
  featsimp_df = glb_featsimp_df
)
#glb_featsimp_df[, paste0(glb_fin_mdl_id, ".imp")] <- glb_featsimp_df$imp
print(glb_featsimp_df)
## All.X..rcv.glmnet.imp
## Date.day.minutes.poly.2 1.000000e+02
## Date.day.minutes.poly.3 3.025283e+01
## Date.day.minutes.poly.5 2.216548e+01
## Date.day.minutes.poly.1 1.148563e+01
## LocationDescription.my.fctrGAS STATION 3.044642e+00
## LocationDescription.my.fctrCommercialVehicle 2.635189e+00
## LocationDescription.my.fctrGovernment 1.853763e+00
## LocationDescription.my.fctrSchool 1.578745e+00
## Date.minute.fctr(44.2,59.1] 1.050523e+00
## Date.minute.fctr(14.8,29.5] 1.004957e+00
## LocationDescription.my.fctrEntertainment 9.472644e-01
## LocationDescription.my.fctrOther 8.291560e-01
## Date.year.fctr2011 7.999311e-01
## Date.year.fctr2012 6.950268e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.) 6.770819e-01
## Date.year.fctr2010 6.682230e-01
## LocationDescription.my.fctrResidence 5.815146e-01
## LocationDescription.my.fctrALLEY 5.633250e-01
## Date.year.fctr2007 2.391936e-01
## Date.last32.log1p 2.236680e-01
## Date.year.fctr2003 2.236006e-01
## Date.year.fctr2002 2.073581e-01
## Date.year.fctr2004 1.889544e-01
## Date.minute.fctr(29.5,44.2] 1.870589e-01
## Date.month.fctr02 1.375969e-01
## Date.year.fctr2005 1.309955e-01
## Date.wkend 1.307452e-01
## Date.month.fctr12 1.131041e-01
## Date.month.fctr05 1.102015e-01
## District.fctr10-19 6.936166e-02
## Year 6.181592e-02
## Date.date.fctr(13,19] 5.350073e-02
## Date.wkday.fctr4 3.937589e-02
## .rnorm 1.378027e-02
## Date.last2.log1p 1.142459e-02
## Date.last16.log1p 4.170085e-04
## .pos.y 3.106683e-06
## .pos 3.039085e-06
## ID 1.144659e-07
## Beat 0.000000e+00
## Date.date.fctr(7,13] 0.000000e+00
## Date.date.fctr(19,25] 0.000000e+00
## Date.date.fctr(25,31] 0.000000e+00
## Date.day.minutes.poly.4 0.000000e+00
## Date.hour.fctr(7.67,15.3] 0.000000e+00
## Date.hour.fctr(15.3,23] 0.000000e+00
## Date.juliandate 0.000000e+00
## Date.last4.log1p 0.000000e+00
## Date.last8.log1p 0.000000e+00
## Date.month.fctr03 0.000000e+00
## Date.month.fctr04 0.000000e+00
## Date.month.fctr06 0.000000e+00
## Date.month.fctr07 0.000000e+00
## Date.month.fctr08 0.000000e+00
## Date.month.fctr09 0.000000e+00
## Date.month.fctr10 0.000000e+00
## Date.month.fctr11 0.000000e+00
## Date.wkday.fctr1 0.000000e+00
## Date.wkday.fctr2 0.000000e+00
## Date.wkday.fctr3 0.000000e+00
## Date.wkday.fctr5 0.000000e+00
## Date.wkday.fctr6 0.000000e+00
## Date.year.fctr2006 0.000000e+00
## Date.year.fctr2008 0.000000e+00
## Date.year.fctr2009 0.000000e+00
## District.fctr1-9 0.000000e+00
## District.fctr20+ 0.000000e+00
## LocationDescription.my.fctrSidewalk 0.000000e+00
## LocationDescription.my.fctrVACANT LOT/LAND 0.000000e+00
## LocationDescription.my.fctrVEHICLE NON-COMMERCIAL 0.000000e+00
## LocationDescription.my.fctrcha 0.000000e+00
## imp
## Date.day.minutes.poly.2 1.000000e+02
## Date.day.minutes.poly.3 3.025283e+01
## Date.day.minutes.poly.5 2.216548e+01
## Date.day.minutes.poly.1 1.148563e+01
## LocationDescription.my.fctrGAS STATION 3.044642e+00
## LocationDescription.my.fctrCommercialVehicle 2.635189e+00
## LocationDescription.my.fctrGovernment 1.853763e+00
## LocationDescription.my.fctrSchool 1.578745e+00
## Date.minute.fctr(44.2,59.1] 1.050523e+00
## Date.minute.fctr(14.8,29.5] 1.004957e+00
## LocationDescription.my.fctrEntertainment 9.472644e-01
## LocationDescription.my.fctrOther 8.291560e-01
## Date.year.fctr2011 7.999311e-01
## Date.year.fctr2012 6.950268e-01
## LocationDescription.my.fctrPARKING LOT/GARAGE(NON.RESID.) 6.770819e-01
## Date.year.fctr2010 6.682230e-01
## LocationDescription.my.fctrResidence 5.815146e-01
## LocationDescription.my.fctrALLEY 5.633250e-01
## Date.year.fctr2007 2.391936e-01
## Date.last32.log1p 2.236680e-01
## Date.year.fctr2003 2.236006e-01
## Date.year.fctr2002 2.073581e-01
## Date.year.fctr2004 1.889544e-01
## Date.minute.fctr(29.5,44.2] 1.870589e-01
## Date.month.fctr02 1.375969e-01
## Date.year.fctr2005 1.309955e-01
## Date.wkend 1.307452e-01
## Date.month.fctr12 1.131041e-01
## Date.month.fctr05 1.102015e-01
## District.fctr10-19 6.936166e-02
## Year 6.181592e-02
## Date.date.fctr(13,19] 5.350073e-02
## Date.wkday.fctr4 3.937589e-02
## .rnorm 1.378027e-02
## Date.last2.log1p 1.142459e-02
## Date.last16.log1p 4.170085e-04
## .pos.y 3.106683e-06
## .pos 3.039085e-06
## ID 1.144659e-07
## Beat 0.000000e+00
## Date.date.fctr(7,13] 0.000000e+00
## Date.date.fctr(19,25] 0.000000e+00
## Date.date.fctr(25,31] 0.000000e+00
## Date.day.minutes.poly.4 0.000000e+00
## Date.hour.fctr(7.67,15.3] 0.000000e+00
## Date.hour.fctr(15.3,23] 0.000000e+00
## Date.juliandate 0.000000e+00
## Date.last4.log1p 0.000000e+00
## Date.last8.log1p 0.000000e+00
## Date.month.fctr03 0.000000e+00
## Date.month.fctr04 0.000000e+00
## Date.month.fctr06 0.000000e+00
## Date.month.fctr07 0.000000e+00
## Date.month.fctr08 0.000000e+00
## Date.month.fctr09 0.000000e+00
## Date.month.fctr10 0.000000e+00
## Date.month.fctr11 0.000000e+00
## Date.wkday.fctr1 0.000000e+00
## Date.wkday.fctr2 0.000000e+00
## Date.wkday.fctr3 0.000000e+00
## Date.wkday.fctr5 0.000000e+00
## Date.wkday.fctr6 0.000000e+00
## Date.year.fctr2006 0.000000e+00
## Date.year.fctr2008 0.000000e+00
## Date.year.fctr2009 0.000000e+00
## District.fctr1-9 0.000000e+00
## District.fctr20+ 0.000000e+00
## LocationDescription.my.fctrSidewalk 0.000000e+00
## LocationDescription.my.fctrVACANT LOT/LAND 0.000000e+00
## LocationDescription.my.fctrVEHICLE NON-COMMERCIAL 0.000000e+00
## LocationDescription.my.fctrcha 0.000000e+00
# Diagnostic plots for the final model on the training observations; the
# selected model's OOB-optimal threshold is passed only for binomial
# classification
if (glb_is_classification && glb_is_binomial) {
  glb_analytics_diag_plots(obs_df = glbObsTrn, mdl_id = glb_fin_mdl_id,
                           prob_threshold = glb_models_df[
                             glb_models_df$id == glb_sel_mdl_id,
                             "opt.prob.threshold.OOB"])
} else {
  glb_analytics_diag_plots(obs_df = glbObsTrn, mdl_id = glb_fin_mdl_id)
}
## Warning in glb_analytics_diag_plots(obs_df = glbObsTrn, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 26
## [1] "Min/Max Boundaries: "
## ID Arrest Arrest.All.X..rcv.glmnet.prob Arrest.All.X..rcv.glmnet
## 1 8812290 T NA <NA>
## 2 8834045 T NA <NA>
## 3 8864270 T NA <NA>
## 4 8949625 T NA <NA>
## 5 8944054 T NA <NA>
## 6 8868931 T NA <NA>
## Arrest.All.X..rcv.glmnet.err Arrest.All.X..rcv.glmnet.err.abs
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## Arrest.All.X..rcv.glmnet.is.acc Arrest.Final.All.X..rcv.glmnet.prob
## 1 NA 0.03510473
## 2 NA 0.03591061
## 3 NA 0.03686818
## 4 NA 0.03951605
## 5 NA 0.04552500
## 6 NA 0.04561321
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 1 F TRUE
## 2 F TRUE
## 3 F TRUE
## 4 F TRUE
## 5 F TRUE
## 6 F TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 1 0.9648953
## 2 0.9640894
## 3 0.9631318
## 4 0.9604840
## 5 0.9544750
## 6 0.9543868
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Arrest.Final.All.X..rcv.glmnet.error .label
## 1 -0.06489527 8812290
## 2 -0.06408939 8834045
## 3 -0.06313182 8864270
## 4 -0.06048395 8949625
## 5 -0.05447500 8944054
## 6 -0.05438679 8868931
## ID Arrest Arrest.All.X..rcv.glmnet.prob Arrest.All.X..rcv.glmnet
## 1 8812290 T NA <NA>
## 5 8944054 T NA <NA>
## 6 8868931 T NA <NA>
## 7 8652015 T NA <NA>
## 14 5828084 F NA <NA>
## 16 5812403 F NA <NA>
## Arrest.All.X..rcv.glmnet.err Arrest.All.X..rcv.glmnet.err.abs
## 1 NA NA
## 5 NA NA
## 6 NA NA
## 7 NA NA
## 14 NA NA
## 16 NA NA
## Arrest.All.X..rcv.glmnet.is.acc Arrest.Final.All.X..rcv.glmnet.prob
## 1 NA 0.03510473
## 5 NA 0.04552500
## 6 NA 0.04561321
## 7 NA 0.06109537
## 14 NA 0.10012220
## 16 NA 0.10788852
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 1 F TRUE
## 5 F TRUE
## 6 F TRUE
## 7 F TRUE
## 14 T TRUE
## 16 T TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 1 0.9648953
## 5 0.9544750
## 6 0.9543868
## 7 0.9389046
## 14 0.1001222
## 16 0.1078885
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 1 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
## 14 FALSE
## 16 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 1 FALSE
## 5 FALSE
## 6 FALSE
## 7 FALSE
## 14 FALSE
## 16 FALSE
## Arrest.Final.All.X..rcv.glmnet.error .label
## 1 -0.0648952733 8812290
## 5 -0.0544750025 8944054
## 6 -0.0543867887 8868931
## 7 -0.0389046278 8652015
## 14 0.0001221975 5828084
## 16 0.0078885226 5812403
## ID Arrest Arrest.All.X..rcv.glmnet.prob Arrest.All.X..rcv.glmnet
## 16 5812403 F NA <NA>
## 17 5388451 F NA <NA>
## 18 6171778 F NA <NA>
## 19 5696831 F NA <NA>
## 20 5627081 F NA <NA>
## 21 3732883 F NA <NA>
## Arrest.All.X..rcv.glmnet.err Arrest.All.X..rcv.glmnet.err.abs
## 16 NA NA
## 17 NA NA
## 18 NA NA
## 19 NA NA
## 20 NA NA
## 21 NA NA
## Arrest.All.X..rcv.glmnet.is.acc Arrest.Final.All.X..rcv.glmnet.prob
## 16 NA 0.1078885
## 17 NA 0.1126194
## 18 NA 0.1135136
## 19 NA 0.1147054
## 20 NA 0.1219047
## 21 NA 0.2609324
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 16 T TRUE
## 17 T TRUE
## 18 T TRUE
## 19 T TRUE
## 20 T TRUE
## 21 T TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 16 0.1078885
## 17 0.1126194
## 18 0.1135136
## 19 0.1147054
## 20 0.1219047
## 21 0.2609324
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 16 FALSE
## 17 FALSE
## 18 FALSE
## 19 FALSE
## 20 FALSE
## 21 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 16 FALSE
## 17 FALSE
## 18 FALSE
## 19 FALSE
## 20 FALSE
## 21 FALSE
## Arrest.Final.All.X..rcv.glmnet.error .label
## 16 0.007888523 5812403
## 17 0.012619383 5388451
## 18 0.013513558 6171778
## 19 0.014705374 5696831
## 20 0.021904740 5627081
## 21 0.160932443 3732883
## [1] "Inaccurate: "
## ID Arrest Arrest.All.X..rcv.glmnet.prob Arrest.All.X..rcv.glmnet
## 1 8596516 T NA <NA>
## 2 8811336 T NA <NA>
## 3 8872919 T NA <NA>
## 4 8909546 T NA <NA>
## 5 8172456 T NA <NA>
## 6 8889814 T NA <NA>
## Arrest.All.X..rcv.glmnet.err Arrest.All.X..rcv.glmnet.err.abs
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## Arrest.All.X..rcv.glmnet.is.acc Arrest.Final.All.X..rcv.glmnet.prob
## 1 NA 0.03341265
## 2 NA 0.03374826
## 3 NA 0.03411970
## 4 NA 0.03429021
## 5 NA 0.03445456
## 6 NA 0.03452531
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 1 F TRUE
## 2 F TRUE
## 3 F TRUE
## 4 F TRUE
## 5 F TRUE
## 6 F TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 1 0.9665874
## 2 0.9662517
## 3 0.9658803
## 4 0.9657098
## 5 0.9655454
## 6 0.9654747
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Arrest.Final.All.X..rcv.glmnet.error
## 1 -0.06658735
## 2 -0.06625174
## 3 -0.06588030
## 4 -0.06570979
## 5 -0.06554544
## 6 -0.06547469
## ID Arrest Arrest.All.X..rcv.glmnet.prob
## 3576 5581463 T NA
## 4353 5046417 T NA
## 9056 2996815 T NA
## 17377 2697718 F NA
## 38105 1874656 F NA
## 39001 2876481 F NA
## Arrest.All.X..rcv.glmnet Arrest.All.X..rcv.glmnet.err
## 3576 <NA> NA
## 4353 <NA> NA
## 9056 <NA> NA
## 17377 <NA> NA
## 38105 <NA> NA
## 39001 <NA> NA
## Arrest.All.X..rcv.glmnet.err.abs Arrest.All.X..rcv.glmnet.is.acc
## 3576 NA NA
## 4353 NA NA
## 9056 NA NA
## 17377 NA NA
## 38105 NA NA
## 39001 NA NA
## Arrest.Final.All.X..rcv.glmnet.prob Arrest.Final.All.X..rcv.glmnet
## 3576 0.07186872 F
## 4353 0.07604893 F
## 9056 0.09573999 F
## 17377 0.10343055 T
## 38105 0.12562093 T
## 39001 0.12745382 T
## Arrest.Final.All.X..rcv.glmnet.err
## 3576 TRUE
## 4353 TRUE
## 9056 TRUE
## 17377 TRUE
## 38105 TRUE
## 39001 TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 3576 0.9281313
## 4353 0.9239511
## 9056 0.9042600
## 17377 0.1034306
## 38105 0.1256209
## 39001 0.1274538
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 3576 FALSE
## 4353 FALSE
## 9056 FALSE
## 17377 FALSE
## 38105 FALSE
## 39001 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 3576 FALSE
## 4353 FALSE
## 9056 FALSE
## 17377 FALSE
## 38105 FALSE
## 39001 FALSE
## Arrest.Final.All.X..rcv.glmnet.error
## 3576 -0.028131277
## 4353 -0.023951075
## 9056 -0.004260006
## 17377 0.003430551
## 38105 0.025620926
## 39001 0.027453817
## ID Arrest Arrest.All.X..rcv.glmnet.prob
## 47483 1684852 F NA
## 47484 1343195 F NA
## 47485 1320838 F NA
## 47486 1394439 F NA
## 47487 2046674 F NA
## 47488 2004388 F NA
## Arrest.All.X..rcv.glmnet Arrest.All.X..rcv.glmnet.err
## 47483 <NA> NA
## 47484 <NA> NA
## 47485 <NA> NA
## 47486 <NA> NA
## 47487 <NA> NA
## 47488 <NA> NA
## Arrest.All.X..rcv.glmnet.err.abs Arrest.All.X..rcv.glmnet.is.acc
## 47483 NA NA
## 47484 NA NA
## 47485 NA NA
## 47486 NA NA
## 47487 NA NA
## 47488 NA NA
## Arrest.Final.All.X..rcv.glmnet.prob Arrest.Final.All.X..rcv.glmnet
## 47483 0.3024590 T
## 47484 0.3025230 T
## 47485 0.3053358 T
## 47486 0.3065130 T
## 47487 0.3066945 T
## 47488 0.3089428 T
## Arrest.Final.All.X..rcv.glmnet.err
## 47483 TRUE
## 47484 TRUE
## 47485 TRUE
## 47486 TRUE
## 47487 TRUE
## 47488 TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 47483 0.3024590
## 47484 0.3025230
## 47485 0.3053358
## 47486 0.3065130
## 47487 0.3066945
## 47488 0.3089428
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 47483 FALSE
## 47484 FALSE
## 47485 FALSE
## 47486 FALSE
## 47487 FALSE
## 47488 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 47483 FALSE
## 47484 FALSE
## 47485 FALSE
## 47486 FALSE
## 47487 FALSE
## 47488 FALSE
## Arrest.Final.All.X..rcv.glmnet.error
## 47483 0.2024590
## 47484 0.2025230
## 47485 0.2053358
## 47486 0.2065130
## 47487 0.2066945
## 47488 0.2089428
# Ids of all features that have a non-NA value in at least one column of
# glb_feats_df whose name contains ".imp"; stays NULL when there is no such
# column
dsp_feats_vctr <- Reduce(
  union,
  lapply(
    grep(".imp", names(glb_feats_df), fixed = TRUE, value = TRUE),
    function(impCol) glb_feats_df[!is.na(glb_feats_df[, impCol]), "id"]
  ),
  NULL
)
# print(glbObsTrn[glbObsTrn$UniqueID %in% FN_OOB_ids,
# grep(glb_rsp_var, names(glbObsTrn), value=TRUE)])
# Columns present in glbObsTrn but not yet in glbObsAll
print(setdiff(names(glbObsTrn), names(glbObsAll)))
## [1] "Arrest.Final.All.X..rcv.glmnet.prob"
## [2] "Arrest.Final.All.X..rcv.glmnet"
## [3] "Arrest.Final.All.X..rcv.glmnet.err"
## [4] "Arrest.Final.All.X..rcv.glmnet.err.abs"
## [5] "Arrest.Final.All.X..rcv.glmnet.is.acc"
# Copy each of those columns into the "Train" rows of glbObsAll; the
# assignment is positional, so it relies on both having the same row order
for (newCol in setdiff(names(glbObsTrn), names(glbObsAll))) {
  # Merge or cbind ?
  glbObsAll[glbObsAll$.src == "Train", newCol] <- glbObsTrn[, newCol]
}
print(setdiff(names(glbObsFit), names(glbObsAll)))
## character(0)
print(setdiff(names(glbObsOOB), names(glbObsAll)))
## character(0)
# Same positional copy for any OOB-only columns, into the "OOB" rows
for (newCol in setdiff(names(glbObsOOB), names(glbObsAll))) {
  # Merge or cbind ?
  glbObsAll[glbObsAll$.lcn == "OOB", newCol] <- glbObsOOB[, newCol]
}
print(setdiff(names(glbObsNew), names(glbObsAll)))
## character(0)
#glb2Sav(); all.equal(savObsAll, glbObsAll); all.equal(sav_models_lst, glb_models_lst)
#load(file = paste0(glbOut$pfx, "dsk_knitr.RData"))
#cmpCols <- names(glbObsAll)[!grepl("\\.Final\\.", names(glbObsAll))]; all.equal(savObsAll[, cmpCols], glbObsAll[, cmpCols]); all.equal(savObsAll[, "H.P.http"], glbObsAll[, "H.P.http"]);
# Mark the training predictions and the final model as available, then replay
# the analytics petri net with the updated list
glb_analytics_avl_objs <- c(
  glb_analytics_avl_objs,
  "data.training.all.prediction", "model.final"
)
replay.petrisim(
  pn = glb_analytics_pn,
  replay.trans = glb_analytics_avl_objs,
  flip_coord = TRUE
)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
## 3.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: data.training.all.prediction
## 4.0000 5 0 1 1 1
## 4.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: model.final
## 5.0000 4 0 0 2 1
# Start the "predict.data.new" major step in the chunk table
glb_chunks_df <- myadd_chunk(
  glb_chunks_df,
  "predict.data.new",
  major.inc = TRUE
)
## label step_major step_minor label_minor bgn end
## 21 fit.data.training 9 1 1 5636.298 5788.618
## 22 predict.data.new 10 0 0 5788.618 NA
## elapsed
## 21 152.32
## 22 NA
10.0: predict data new
## Warning in glb_analytics_diag_plots(obs_df = glbObsNew, mdl_id =
## glb_fin_mdl_id, : Limiting important feature scatter plots to 5 out of 26
## [1] "Min/Max Boundaries: "
## ID Arrest Arrest.Final.All.X..rcv.glmnet.prob
## 1 8812290 T 0.04960291
## 2 8834045 T 0.05071953
## 3 8864270 T 0.05257967
## 4 8949625 T 0.05416327
## 5 8868931 T 0.06127878
## 6 8944054 T 0.06188272
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 1 F TRUE
## 2 F TRUE
## 3 F TRUE
## 4 F TRUE
## 5 F TRUE
## 6 F TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 1 0.9503971
## 2 0.9492805
## 3 0.9474203
## 4 0.9458367
## 5 0.9387212
## 6 0.9381173
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Arrest.Final.All.X..rcv.glmnet.error .label
## 1 -0.05039709 8812290
## 2 -0.04928047 8834045
## 3 -0.04742033 8864270
## 4 -0.04583673 8949625
## 5 -0.03872122 8868931
## 6 -0.03811728 8944054
## ID Arrest Arrest.Final.All.X..rcv.glmnet.prob
## 2 8834045 T 0.05071953
## 7 8652015 T 0.08516781
## 15 6133577 F 0.11778570
## 18 6171778 F 0.15343180
## 19 5627081 F 0.16585005
## 21 3732883 F 0.33460660
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 2 F TRUE
## 7 F TRUE
## 15 T TRUE
## 18 T TRUE
## 19 T TRUE
## 21 T TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 2 0.9492805
## 7 0.9148322
## 15 0.1177857
## 18 0.1534318
## 19 0.1658500
## 21 0.3346066
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 2 FALSE
## 7 FALSE
## 15 FALSE
## 18 FALSE
## 19 FALSE
## 21 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 2 FALSE
## 7 FALSE
## 15 FALSE
## 18 FALSE
## 19 FALSE
## 21 FALSE
## Arrest.Final.All.X..rcv.glmnet.error .label
## 2 -0.04928047 8834045
## 7 -0.01483219 8652015
## 15 0.01778570 6133577
## 18 0.05343180 6171778
## 19 0.06585005 5627081
## 21 0.23460660 3732883
## ID Arrest Arrest.Final.All.X..rcv.glmnet.prob
## 16 6553663 F 0.1234584
## 17 6762713 F 0.1472955
## 18 6171778 F 0.1534318
## 19 5627081 F 0.1658500
## 20 1814843 F 0.1838076
## 21 3732883 F 0.3346066
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 16 T TRUE
## 17 T TRUE
## 18 T TRUE
## 19 T TRUE
## 20 T TRUE
## 21 T TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 16 0.1234584
## 17 0.1472955
## 18 0.1534318
## 19 0.1658500
## 20 0.1838076
## 21 0.3346066
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 16 FALSE
## 17 FALSE
## 18 FALSE
## 19 FALSE
## 20 FALSE
## 21 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 16 FALSE
## 17 FALSE
## 18 FALSE
## 19 FALSE
## 20 FALSE
## 21 FALSE
## Arrest.Final.All.X..rcv.glmnet.error .label
## 16 0.02345841 6553663
## 17 0.04729550 6762713
## 18 0.05343180 6171778
## 19 0.06585005 5627081
## 20 0.08380760 1814843
## 21 0.23460660 3732883
## [1] "Inaccurate: "
## ID Arrest Arrest.Final.All.X..rcv.glmnet.prob
## 1 8596516 T 0.04555389
## 2 8872919 T 0.04765161
## 3 8909546 T 0.04805332
## 4 8811336 T 0.04805341
## 5 8341397 T 0.04841366
## 6 8337628 T 0.04855636
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 1 F TRUE
## 2 F TRUE
## 3 F TRUE
## 4 F TRUE
## 5 F TRUE
## 6 F TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 1 0.9544461
## 2 0.9523484
## 3 0.9519467
## 4 0.9519466
## 5 0.9515863
## 6 0.9514436
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## Arrest.Final.All.X..rcv.glmnet.error
## 1 -0.05444611
## 2 -0.05234839
## 3 -0.05194668
## 4 -0.05194659
## 5 -0.05158634
## 6 -0.05144364
## ID Arrest Arrest.Final.All.X..rcv.glmnet.prob
## 17416 3648343 F 0.1075532
## 30539 4482356 F 0.1150336
## 41989 2896997 F 0.1211258
## 67017 5970112 F 0.1344074
## 73912 1345183 F 0.1385298
## 102127 1577341 F 0.1905102
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 17416 T TRUE
## 30539 T TRUE
## 41989 T TRUE
## 67017 T TRUE
## 73912 T TRUE
## 102127 T TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 17416 0.1075532
## 30539 0.1150336
## 41989 0.1211258
## 67017 0.1344074
## 73912 0.1385298
## 102127 0.1905102
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 17416 FALSE
## 30539 FALSE
## 41989 FALSE
## 67017 FALSE
## 73912 FALSE
## 102127 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 17416 FALSE
## 30539 FALSE
## 41989 FALSE
## 67017 FALSE
## 73912 FALSE
## 102127 FALSE
## Arrest.Final.All.X..rcv.glmnet.error
## 17416 0.007553223
## 30539 0.015033571
## 41989 0.021125786
## 67017 0.034407424
## 73912 0.038529772
## 102127 0.090510238
## ID Arrest Arrest.Final.All.X..rcv.glmnet.prob
## 105960 1684852 F 0.3820876
## 105961 1343195 F 0.3831750
## 105962 2046674 F 0.3840201
## 105963 1320838 F 0.3864216
## 105964 1394439 F 0.3877208
## 105965 2004388 F 0.3886837
## Arrest.Final.All.X..rcv.glmnet Arrest.Final.All.X..rcv.glmnet.err
## 105960 T TRUE
## 105961 T TRUE
## 105962 T TRUE
## 105963 T TRUE
## 105964 T TRUE
## 105965 T TRUE
## Arrest.Final.All.X..rcv.glmnet.err.abs
## 105960 0.3820876
## 105961 0.3831750
## 105962 0.3840201
## 105963 0.3864216
## 105964 0.3877208
## 105965 0.3886837
## Arrest.Final.All.X..rcv.glmnet.is.acc
## 105960 FALSE
## 105961 FALSE
## 105962 FALSE
## 105963 FALSE
## 105964 FALSE
## 105965 FALSE
## Arrest.Final.All.X..rcv.glmnet.accurate
## 105960 FALSE
## 105961 FALSE
## 105962 FALSE
## 105963 FALSE
## 105964 FALSE
## 105965 FALSE
## Arrest.Final.All.X..rcv.glmnet.error
## 105960 0.2820876
## 105961 0.2831750
## 105962 0.2840201
## 105963 0.2864216
## 105964 0.2877208
## 105965 0.2886837
## Loading required package: tidyr
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
##
## expand
## [1] "OOBobs Arrest.All.X..rcv.glmnet F: min < min of Train range: 1"
## ID Arrest.All.X..rcv.glmnet ID.1
## 383272 1310022 F 1310022
## id cor.y exclude.as.feat cor.y.abs cor.high.X freqRatio
## ID ID -0.09085945 FALSE 0.09085945 <NA> 1
## percentUnique zeroVar nzv is.cor.y.abs.low interaction.feat
## ID 100 FALSE FALSE FALSE NA
## shapiro.test.p.value rsp_var_raw id_var rsp_var max min
## ID 1.66718e-42 FALSE TRUE NA 9181151 1310022
## max.Arrest.F max.Arrest.T min.Arrest.F min.Arrest.T
## ID 9181151 9074795 1310068 1310022
## max.Arrest.All.X..rcv.glmnet.F max.Arrest.All.X..rcv.glmnet.T
## ID 9181151 9100606
## min.Arrest.All.X..rcv.glmnet.F min.Arrest.All.X..rcv.glmnet.T
## ID 1310022 1310455
## max.Arrest.Final.All.X..rcv.glmnet.F
## ID 9181151
## max.Arrest.Final.All.X..rcv.glmnet.T
## ID 9100606
## min.Arrest.Final.All.X..rcv.glmnet.F
## ID 1310022
## min.Arrest.Final.All.X..rcv.glmnet.T
## ID 1310455
## [1] "OOBobs Arrest.All.X..rcv.glmnet F: max > max of Train range: 77722"
## ID Arrest.All.X..rcv.glmnet .pos .pos.y
## 191642 8951354 F 191642 191642
## 191643 8951141 F 191643 191643
## 191644 8952745 F 191644 191644
## 191645 8952223 F 191645 191645
## 191646 8951608 F 191646 191646
## 191647 8950793 F 191647 191647
## ID Arrest.All.X..rcv.glmnet .pos .pos.y
## 194514 8856196 F 194514 194514
## 222862 7831914 F 222862 222862
## 266177 5857481 F 266177 266177
## 279209 5142603 F 279209 279209
## 283757 4915236 F 283757 283757
## 299165 4307864 F 299165 299165
## ID Arrest.All.X..rcv.glmnet .pos .pos.y
## 383277 1310755 F 383277 383277
## 383278 1310068 F 383278 383278
## 383279 1313404 F 383279 383279
## 383280 1313442 F 383280 383280
## 383281 1563324 F 383281 383281
## 383282 1310463 F 383282 383282
## id cor.y exclude.as.feat cor.y.abs cor.high.X freqRatio
## .pos .pos 0.08947413 FALSE 0.08947413 ID 1
## .pos.y .pos.y 0.08947413 FALSE 0.08947413 .pos 1
## percentUnique zeroVar nzv is.cor.y.abs.low interaction.feat
## .pos 100 FALSE FALSE FALSE NA
## .pos.y 100 FALSE FALSE FALSE NA
## shapiro.test.p.value rsp_var_raw id_var rsp_var max min
## .pos 5.299237e-37 FALSE NA NA 383282 1
## .pos.y 2.227987e-37 FALSE NA NA 383282 1
## max.Arrest.F max.Arrest.T min.Arrest.F min.Arrest.T
## .pos 191641 191631 1 6
## .pos.y 191641 191631 1 6
## max.Arrest.All.X..rcv.glmnet.F max.Arrest.All.X..rcv.glmnet.T
## .pos 383282 383273
## .pos.y 383282 383273
## min.Arrest.All.X..rcv.glmnet.F min.Arrest.All.X..rcv.glmnet.T
## .pos 191642 191655
## .pos.y 191642 191655
## max.Arrest.Final.All.X..rcv.glmnet.F
## .pos 383282
## .pos.y 383282
## max.Arrest.Final.All.X..rcv.glmnet.T
## .pos 383273
## .pos.y 383273
## min.Arrest.Final.All.X..rcv.glmnet.F
## .pos 191642
## .pos.y 191642
## min.Arrest.Final.All.X..rcv.glmnet.T
## .pos 191655
## .pos.y 191655
## [1] "OOBobs Arrest.All.X..rcv.glmnet T: min < min of Train range: 43"
## ID Arrest.All.X..rcv.glmnet Date.day.minutes.poly.4
## 244904 6914116 T -0.0030216044
## 246961 6787203 T -0.0030216044
## 288888 4701491 T -0.0030216044
## 295329 4470160 T -0.0013474541
## 298824 4321368 T 0.0024865918
## 299027 4308935 T -0.0002141643
## Date.last16.log1p
## 244904 10.223431
## 246961 9.611864
## 288888 9.827740
## 295329 0.000000
## 298824 4.110874
## 299027 0.000000
## ID Arrest.All.X..rcv.glmnet Date.day.minutes.poly.4
## 301159 4183069 T -0.0030216044
## 316864 3477584 T 0.0019011326
## 350055 2326431 T 0.0019011326
## 367300 1803839 T -0.0002141643
## 367594 1795659 T -0.0002141643
## 367687 1797652 T -0.0002141643
## Date.last16.log1p
## 301159 9.920394
## 316864 0.000000
## 350055 0.000000
## 367300 0.000000
## 367594 0.000000
## 367687 0.000000
## ID Arrest.All.X..rcv.glmnet Date.day.minutes.poly.4
## 369985 1736330 T -0.0002141643
## 369986 1735435 T -0.0002141643
## 372533 1657791 T 0.0019011326
## 372534 1658103 T 0.0019011326
## 377505 1501928 T -0.0002141643
## 377506 1501713 T -0.0002141643
## Date.last16.log1p
## 369985 0
## 369986 0
## 372533 0
## 372534 0
## 377505 0
## 377506 0
## id cor.y
## Date.day.minutes.poly.4 Date.day.minutes.poly.4 0.003308926
## Date.last16.log1p Date.last16.log1p 0.002532701
## exclude.as.feat cor.y.abs cor.high.X freqRatio
## Date.day.minutes.poly.4 FALSE 0.003308926 <NA> 1.214868
## Date.last16.log1p FALSE 0.002532701 <NA> 1.157359
## percentUnique zeroVar nzv is.cor.y.abs.low
## Date.day.minutes.poly.4 0.7498865 FALSE FALSE TRUE
## Date.last16.log1p 0.4268665 FALSE FALSE TRUE
## interaction.feat shapiro.test.p.value rsp_var_raw
## Date.day.minutes.poly.4 NA 9.607483e-34 FALSE
## Date.last16.log1p NA 8.473255e-18 FALSE
## id_var rsp_var max min
## Date.day.minutes.poly.4 NA NA 0.005189768 -0.003021604
## Date.last16.log1p NA NA 11.133142723 0.000000000
## max.Arrest.F max.Arrest.T min.Arrest.F
## Date.day.minutes.poly.4 0.005189768 0.005189768 -0.003021604
## Date.last16.log1p 11.133142723 11.079076314 0.000000000
## min.Arrest.T max.Arrest.All.X..rcv.glmnet.F
## Date.day.minutes.poly.4 -0.003021555 0.005189768
## Date.last16.log1p 5.707110265 11.092869425
## max.Arrest.All.X..rcv.glmnet.T
## Date.day.minutes.poly.4 0.005189768
## Date.last16.log1p 11.133142723
## min.Arrest.All.X..rcv.glmnet.F
## Date.day.minutes.poly.4 -0.003021555
## Date.last16.log1p 0.000000000
## min.Arrest.All.X..rcv.glmnet.T
## Date.day.minutes.poly.4 -0.003021604
## Date.last16.log1p 0.000000000
## max.Arrest.Final.All.X..rcv.glmnet.F
## Date.day.minutes.poly.4 0.005189768
## Date.last16.log1p 11.092869425
## max.Arrest.Final.All.X..rcv.glmnet.T
## Date.day.minutes.poly.4 0.005189768
## Date.last16.log1p 11.133142723
## min.Arrest.Final.All.X..rcv.glmnet.F
## Date.day.minutes.poly.4 -0.003021555
## Date.last16.log1p 0.000000000
## min.Arrest.Final.All.X..rcv.glmnet.T
## Date.day.minutes.poly.4 -0.003021604
## Date.last16.log1p 0.000000000
## [1] "OOBobs Arrest.All.X..rcv.glmnet T: max > max of Train range: 113907"
## ID Arrest.All.X..rcv.glmnet .pos .pos.y Date.last16.log1p
## 191655 8950570 T 191655 191655 10.034560
## 191666 8950307 T 191666 191666 9.862718
## 191748 8948353 T 191748 191748 9.575053
## 191855 8945802 T 191855 191855 9.692828
## 191920 8943877 T 191920 191920 10.491302
## 191932 8943595 T 191932 191932 10.257694
## Date.last8.log1p ID.1
## 191655 7.090910 8950570
## 191666 8.476580 8950307
## 191748 9.287394 8948353
## 191855 9.287394 8945802
## 191920 10.268165 8943877
## 191932 9.966509 8943595
## ID Arrest.All.X..rcv.glmnet .pos .pos.y Date.last16.log1p
## 287244 4762777 T 287244 287244 9.692828
## 296905 4411747 T 296905 296905 9.729194
## 335778 2812247 T 335778 335778 9.105091
## 342415 2575417 T 342415 342415 9.137877
## 351085 2289044 T 351085 351085 9.872048
## 373961 1617950 T 373961 373961 8.881975
## Date.last8.log1p ID.1
## 287244 8.412055 4762777
## 296905 9.575053 4411747
## 335778 8.188967 2812247
## 342415 7.650169 2575417
## 351085 9.407633 2289044
## 373961 0.000000 1617950
## ID Arrest.All.X..rcv.glmnet .pos .pos.y Date.last16.log1p
## 383263 1850130 T 383263 383263 9.980495
## 383264 1312756 T 383264 383264 9.341456
## 383265 1310455 T 383265 383265 9.341456
## 383266 1918518 T 383266 383266 9.575053
## 383268 1918550 T 383268 383268 9.541154
## 383273 1310583 T 383273 383273 8.425297
## Date.last8.log1p ID.1
## 383263 9.575053 1850130
## 383264 8.237744 1312756
## 383265 8.237744 1310455
## 383266 8.594339 1918518
## 383268 8.938663 1918550
## 383273 8.371242 1310583
## id cor.y exclude.as.feat
## .pos .pos 0.089474127 FALSE
## .pos.y .pos.y 0.089474127 FALSE
## Date.last16.log1p Date.last16.log1p 0.002532701 FALSE
## Date.last8.log1p Date.last8.log1p -0.002512872 FALSE
## ID ID -0.090859450 FALSE
## cor.y.abs cor.high.X freqRatio percentUnique zeroVar
## .pos 0.089474127 ID 1.000000 100.0000000 FALSE
## .pos.y 0.089474127 .pos 1.000000 100.0000000 FALSE
## Date.last16.log1p 0.002532701 <NA> 1.157359 0.4268665 FALSE
## Date.last8.log1p 0.002512872 <NA> 1.002906 0.3073648 FALSE
## ID 0.090859450 <NA> 1.000000 100.0000000 FALSE
## nzv is.cor.y.abs.low interaction.feat
## .pos FALSE FALSE NA
## .pos.y FALSE FALSE NA
## Date.last16.log1p FALSE TRUE NA
## Date.last8.log1p FALSE TRUE NA
## ID FALSE FALSE NA
## shapiro.test.p.value rsp_var_raw id_var rsp_var
## .pos 5.299237e-37 FALSE NA NA
## .pos.y 2.227987e-37 FALSE NA NA
## Date.last16.log1p 8.473255e-18 FALSE NA NA
## Date.last8.log1p 1.818586e-44 FALSE NA NA
## ID 1.667180e-42 FALSE TRUE NA
## max min max.Arrest.F max.Arrest.T
## .pos 3.832820e+05 1 1.916410e+05 1.916310e+05
## .pos.y 3.832820e+05 1 1.916410e+05 1.916310e+05
## Date.last16.log1p 1.113314e+01 0 1.113314e+01 1.107908e+01
## Date.last8.log1p 1.090230e+01 0 1.090230e+01 1.071444e+01
## ID 9.181151e+06 1310022 9.181151e+06 9.074795e+06
## min.Arrest.F min.Arrest.T max.Arrest.All.X..rcv.glmnet.F
## .pos 1 6.000000e+00 3.832820e+05
## .pos.y 1 6.000000e+00 3.832820e+05
## Date.last16.log1p 0 5.707110e+00 1.109287e+01
## Date.last8.log1p 0 0.000000e+00 1.090230e+01
## ID 1310068 1.310022e+06 9.181151e+06
## max.Arrest.All.X..rcv.glmnet.T
## .pos 3.832730e+05
## .pos.y 3.832730e+05
## Date.last16.log1p 1.113314e+01
## Date.last8.log1p 1.079140e+01
## ID 9.100606e+06
## min.Arrest.All.X..rcv.glmnet.F
## .pos 191642
## .pos.y 191642
## Date.last16.log1p 0
## Date.last8.log1p 0
## ID 1310022
## min.Arrest.All.X..rcv.glmnet.T
## .pos 191655
## .pos.y 191655
## Date.last16.log1p 0
## Date.last8.log1p 0
## ID 1310455
## max.Arrest.Final.All.X..rcv.glmnet.F
## .pos 3.832820e+05
## .pos.y 3.832820e+05
## Date.last16.log1p 1.109287e+01
## Date.last8.log1p 1.090230e+01
## ID 9.181151e+06
## max.Arrest.Final.All.X..rcv.glmnet.T
## .pos 3.832730e+05
## .pos.y 3.832730e+05
## Date.last16.log1p 1.113314e+01
## Date.last8.log1p 1.079140e+01
## ID 9.100606e+06
## min.Arrest.Final.All.X..rcv.glmnet.F
## .pos 191642
## .pos.y 191642
## Date.last16.log1p 0
## Date.last8.log1p 0
## ID 1310022
## min.Arrest.Final.All.X..rcv.glmnet.T
## .pos 191655
## .pos.y 191655
## Date.last16.log1p 0
## Date.last8.log1p 0
## ID 1310455
## [1] "OOBobs total range outliers: 191629"
## [1] "newobs Arrest.Final.All.X..rcv.glmnet F: min < min of Train range: 1"
## ID Arrest.Final.All.X..rcv.glmnet ID.1
## 383272 1310022 F 1310022
## id cor.y exclude.as.feat cor.y.abs cor.high.X freqRatio
## ID ID -0.09085945 FALSE 0.09085945 <NA> 1
## percentUnique zeroVar nzv is.cor.y.abs.low interaction.feat
## ID 100 FALSE FALSE FALSE NA
## shapiro.test.p.value rsp_var_raw id_var rsp_var max min
## ID 1.66718e-42 FALSE TRUE NA 9181151 1310022
## max.Arrest.F max.Arrest.T min.Arrest.F min.Arrest.T
## ID 9181151 9074795 1310068 1310022
## max.Arrest.All.X..rcv.glmnet.F max.Arrest.All.X..rcv.glmnet.T
## ID 9181151 9100606
## min.Arrest.All.X..rcv.glmnet.F min.Arrest.All.X..rcv.glmnet.T
## ID 1310022 1310455
## max.Arrest.Final.All.X..rcv.glmnet.F
## ID 9181151
## max.Arrest.Final.All.X..rcv.glmnet.T
## ID 9100606
## min.Arrest.Final.All.X..rcv.glmnet.F
## ID 1310022
## min.Arrest.Final.All.X..rcv.glmnet.T
## ID 1310455
## [1] "newobs Arrest.Final.All.X..rcv.glmnet F: max > max of Train range: 77722"
## ID Arrest.Final.All.X..rcv.glmnet .pos .pos.y
## 191642 8951354 F 191642 191642
## 191643 8951141 F 191643 191643
## 191644 8952745 F 191644 191644
## 191645 8952223 F 191645 191645
## 191646 8951608 F 191646 191646
## 191647 8950793 F 191647 191647
## ID Arrest.Final.All.X..rcv.glmnet .pos .pos.y
## 195658 8821341 F 195658 195658
## 231699 7512874 F 231699 231699
## 253425 6475696 F 253425 253425
## 255027 6402953 F 255027 255027
## 267914 5769119 F 267914 267914
## 280102 5097071 F 280102 280102
## ID Arrest.Final.All.X..rcv.glmnet .pos .pos.y
## 383277 1310755 F 383277 383277
## 383278 1310068 F 383278 383278
## 383279 1313404 F 383279 383279
## 383280 1313442 F 383280 383280
## 383281 1563324 F 383281 383281
## 383282 1310463 F 383282 383282
## id cor.y exclude.as.feat cor.y.abs cor.high.X freqRatio
## .pos .pos 0.08947413 FALSE 0.08947413 ID 1
## .pos.y .pos.y 0.08947413 FALSE 0.08947413 .pos 1
## percentUnique zeroVar nzv is.cor.y.abs.low interaction.feat
## .pos 100 FALSE FALSE FALSE NA
## .pos.y 100 FALSE FALSE FALSE NA
## shapiro.test.p.value rsp_var_raw id_var rsp_var max min
## .pos 5.299237e-37 FALSE NA NA 383282 1
## .pos.y 2.227987e-37 FALSE NA NA 383282 1
## max.Arrest.F max.Arrest.T min.Arrest.F min.Arrest.T
## .pos 191641 191631 1 6
## .pos.y 191641 191631 1 6
## max.Arrest.All.X..rcv.glmnet.F max.Arrest.All.X..rcv.glmnet.T
## .pos 383282 383273
## .pos.y 383282 383273
## min.Arrest.All.X..rcv.glmnet.F min.Arrest.All.X..rcv.glmnet.T
## .pos 191642 191655
## .pos.y 191642 191655
## max.Arrest.Final.All.X..rcv.glmnet.F
## .pos 383282
## .pos.y 383282
## max.Arrest.Final.All.X..rcv.glmnet.T
## .pos 383273
## .pos.y 383273
## min.Arrest.Final.All.X..rcv.glmnet.F
## .pos 191642
## .pos.y 191642
## min.Arrest.Final.All.X..rcv.glmnet.T
## .pos 191655
## .pos.y 191655
## [1] "newobs Arrest.Final.All.X..rcv.glmnet T: min < min of Train range: 43"
## ID Arrest.Final.All.X..rcv.glmnet Date.day.minutes.poly.4
## 244904 6914116 T -0.0030216044
## 246961 6787203 T -0.0030216044
## 288888 4701491 T -0.0030216044
## 295329 4470160 T -0.0013474541
## 298824 4321368 T 0.0024865918
## 299027 4308935 T -0.0002141643
## Date.last16.log1p
## 244904 10.223431
## 246961 9.611864
## 288888 9.827740
## 295329 0.000000
## 298824 4.110874
## 299027 0.000000
## ID Arrest.Final.All.X..rcv.glmnet Date.day.minutes.poly.4
## 322543 3272102 T -0.0013474541
## 322544 3272304 T -0.0013474541
## 365234 1856331 T -0.0015816952
## 366646 1829539 T -0.0002141643
## 369985 1736330 T -0.0002141643
## 372534 1658103 T 0.0019011326
## Date.last16.log1p
## 322543 0
## 322544 0
## 365234 0
## 366646 0
## 369985 0
## 372534 0
## ID Arrest.Final.All.X..rcv.glmnet Date.day.minutes.poly.4
## 369985 1736330 T -0.0002141643
## 369986 1735435 T -0.0002141643
## 372533 1657791 T 0.0019011326
## 372534 1658103 T 0.0019011326
## 377505 1501928 T -0.0002141643
## 377506 1501713 T -0.0002141643
## Date.last16.log1p
## 369985 0
## 369986 0
## 372533 0
## 372534 0
## 377505 0
## 377506 0
## id cor.y
## Date.day.minutes.poly.4 Date.day.minutes.poly.4 0.003308926
## Date.last16.log1p Date.last16.log1p 0.002532701
## exclude.as.feat cor.y.abs cor.high.X freqRatio
## Date.day.minutes.poly.4 FALSE 0.003308926 <NA> 1.214868
## Date.last16.log1p FALSE 0.002532701 <NA> 1.157359
## percentUnique zeroVar nzv is.cor.y.abs.low
## Date.day.minutes.poly.4 0.7498865 FALSE FALSE TRUE
## Date.last16.log1p 0.4268665 FALSE FALSE TRUE
## interaction.feat shapiro.test.p.value rsp_var_raw
## Date.day.minutes.poly.4 NA 9.607483e-34 FALSE
## Date.last16.log1p NA 8.473255e-18 FALSE
## id_var rsp_var max min
## Date.day.minutes.poly.4 NA NA 0.005189768 -0.003021604
## Date.last16.log1p NA NA 11.133142723 0.000000000
## max.Arrest.F max.Arrest.T min.Arrest.F
## Date.day.minutes.poly.4 0.005189768 0.005189768 -0.003021604
## Date.last16.log1p 11.133142723 11.079076314 0.000000000
## min.Arrest.T max.Arrest.All.X..rcv.glmnet.F
## Date.day.minutes.poly.4 -0.003021555 0.005189768
## Date.last16.log1p 5.707110265 11.092869425
## max.Arrest.All.X..rcv.glmnet.T
## Date.day.minutes.poly.4 0.005189768
## Date.last16.log1p 11.133142723
## min.Arrest.All.X..rcv.glmnet.F
## Date.day.minutes.poly.4 -0.003021555
## Date.last16.log1p 0.000000000
## min.Arrest.All.X..rcv.glmnet.T
## Date.day.minutes.poly.4 -0.003021604
## Date.last16.log1p 0.000000000
## max.Arrest.Final.All.X..rcv.glmnet.F
## Date.day.minutes.poly.4 0.005189768
## Date.last16.log1p 11.092869425
## max.Arrest.Final.All.X..rcv.glmnet.T
## Date.day.minutes.poly.4 0.005189768
## Date.last16.log1p 11.133142723
## min.Arrest.Final.All.X..rcv.glmnet.F
## Date.day.minutes.poly.4 -0.003021555
## Date.last16.log1p 0.000000000
## min.Arrest.Final.All.X..rcv.glmnet.T
## Date.day.minutes.poly.4 -0.003021604
## Date.last16.log1p 0.000000000
## [1] "newobs Arrest.Final.All.X..rcv.glmnet T: max > max of Train range: 113907"
## ID Arrest.Final.All.X..rcv.glmnet .pos .pos.y
## 191655 8950570 T 191655 191655
## 191666 8950307 T 191666 191666
## 191748 8948353 T 191748 191748
## 191855 8945802 T 191855 191855
## 191920 8943877 T 191920 191920
## 191932 8943595 T 191932 191932
## Date.last16.log1p Date.last8.log1p ID.1
## 191655 10.034560 7.090910 8950570
## 191666 9.862718 8.476580 8950307
## 191748 9.575053 9.287394 8948353
## 191855 9.692828 9.287394 8945802
## 191920 10.491302 10.268165 8943877
## 191932 10.257694 9.966509 8943595
## ID Arrest.Final.All.X..rcv.glmnet .pos .pos.y
## 297716 4381407 T 297716 297716
## 339277 2686623 T 339277 339277
## 343483 2539653 T 343483 343483
## 360177 1992476 T 360177 360177
## 366203 1825347 T 366203 366203
## 368856 1766305 T 368856 368856
## Date.last16.log1p Date.last8.log1p ID.1
## 297716 8.881975 8.188967 4381407
## 339277 10.034560 9.392745 2686623
## 343483 8.922792 8.101981 2539653
## 360177 9.692828 8.594339 1992476
## 366203 9.200391 8.205492 1825347
## 368856 8.594339 8.188967 1766305
## ID Arrest.Final.All.X..rcv.glmnet .pos .pos.y
## 383263 1850130 T 383263 383263
## 383264 1312756 T 383264 383264
## 383265 1310455 T 383265 383265
## 383266 1918518 T 383266 383266
## 383268 1918550 T 383268 383268
## 383273 1310583 T 383273 383273
## Date.last16.log1p Date.last8.log1p ID.1
## 383263 9.980495 9.575053 1850130
## 383264 9.341456 8.237744 1312756
## 383265 9.341456 8.237744 1310455
## 383266 9.575053 8.594339 1918518
## 383268 9.541154 8.938663 1918550
## 383273 8.425297 8.371242 1310583
## id cor.y exclude.as.feat
## .pos .pos 0.089474127 FALSE
## .pos.y .pos.y 0.089474127 FALSE
## Date.last16.log1p Date.last16.log1p 0.002532701 FALSE
## Date.last8.log1p Date.last8.log1p -0.002512872 FALSE
## ID ID -0.090859450 FALSE
## cor.y.abs cor.high.X freqRatio percentUnique zeroVar
## .pos 0.089474127 ID 1.000000 100.0000000 FALSE
## .pos.y 0.089474127 .pos 1.000000 100.0000000 FALSE
## Date.last16.log1p 0.002532701 <NA> 1.157359 0.4268665 FALSE
## Date.last8.log1p 0.002512872 <NA> 1.002906 0.3073648 FALSE
## ID 0.090859450 <NA> 1.000000 100.0000000 FALSE
## nzv is.cor.y.abs.low interaction.feat
## .pos FALSE FALSE NA
## .pos.y FALSE FALSE NA
## Date.last16.log1p FALSE TRUE NA
## Date.last8.log1p FALSE TRUE NA
## ID FALSE FALSE NA
## shapiro.test.p.value rsp_var_raw id_var rsp_var
## .pos 5.299237e-37 FALSE NA NA
## .pos.y 2.227987e-37 FALSE NA NA
## Date.last16.log1p 8.473255e-18 FALSE NA NA
## Date.last8.log1p 1.818586e-44 FALSE NA NA
## ID 1.667180e-42 FALSE TRUE NA
## max min max.Arrest.F max.Arrest.T
## .pos 3.832820e+05 1 1.916410e+05 1.916310e+05
## .pos.y 3.832820e+05 1 1.916410e+05 1.916310e+05
## Date.last16.log1p 1.113314e+01 0 1.113314e+01 1.107908e+01
## Date.last8.log1p 1.090230e+01 0 1.090230e+01 1.071444e+01
## ID 9.181151e+06 1310022 9.181151e+06 9.074795e+06
## min.Arrest.F min.Arrest.T max.Arrest.All.X..rcv.glmnet.F
## .pos 1 6.000000e+00 3.832820e+05
## .pos.y 1 6.000000e+00 3.832820e+05
## Date.last16.log1p 0 5.707110e+00 1.109287e+01
## Date.last8.log1p 0 0.000000e+00 1.090230e+01
## ID 1310068 1.310022e+06 9.181151e+06
## max.Arrest.All.X..rcv.glmnet.T
## .pos 3.832730e+05
## .pos.y 3.832730e+05
## Date.last16.log1p 1.113314e+01
## Date.last8.log1p 1.079140e+01
## ID 9.100606e+06
## min.Arrest.All.X..rcv.glmnet.F
## .pos 191642
## .pos.y 191642
## Date.last16.log1p 0
## Date.last8.log1p 0
## ID 1310022
## min.Arrest.All.X..rcv.glmnet.T
## .pos 191655
## .pos.y 191655
## Date.last16.log1p 0
## Date.last8.log1p 0
## ID 1310455
## max.Arrest.Final.All.X..rcv.glmnet.F
## .pos 3.832820e+05
## .pos.y 3.832820e+05
## Date.last16.log1p 1.109287e+01
## Date.last8.log1p 1.090230e+01
## ID 9.181151e+06
## max.Arrest.Final.All.X..rcv.glmnet.T
## .pos 3.832730e+05
## .pos.y 3.832730e+05
## Date.last16.log1p 1.113314e+01
## Date.last8.log1p 1.079140e+01
## ID 9.100606e+06
## min.Arrest.Final.All.X..rcv.glmnet.F
## .pos 191642
## .pos.y 191642
## Date.last16.log1p 0
## Date.last8.log1p 0
## ID 1310022
## min.Arrest.Final.All.X..rcv.glmnet.T
## .pos 191655
## .pos.y 191655
## Date.last16.log1p 0
## Date.last8.log1p 0
## ID 1310455
## [1] "newobs total range outliers: 191629"
## [1] TRUE
## [1] 0.1
## [1] "glb_sel_mdl_id: All.X##rcv#glmnet"
## [1] "glb_fin_mdl_id: Final.All.X##rcv#glmnet"
## [1] "Cross Validation issues:"
## MFO###myMFO_classfr Random###myrandom_classfr
## 0 0
## Max.cor.Y.rcv.1X1###glmnet
## 0
## max.Accuracy.OOB max.AUCROCR.OOB
## Max.cor.Y.Time.Lag##rcv#glmnet 0.79132595 0.6170827
## Max.cor.Y.Time.Poly##rcv#glmnet 0.75721316 0.6151250
## Max.cor.Y.rcv.1X1###glmnet 0.73198211 0.6119694
## Low.cor.X##rcv#glmnet 0.44703046 0.6249179
## All.X##rcv#glmnet 0.44703046 0.6249179
## Final.All.X##rcv#glmnet 0.44703046 0.6249179
## All.X##rcv#glm 0.08107332 0.6259209
## Interact.High.cor.Y##rcv#glmnet 0.08107332 0.5522346
## Max.cor.Y##rcv#rpart 0.08107332 0.5000000
## MFO###myMFO_classfr 0.08107332 0.5000000
## Random###myrandom_classfr 0.08107332 0.4997988
## max.AUCpROC.OOB max.Accuracy.fit
## Max.cor.Y.Time.Lag##rcv#glmnet 0.5000000 0.91892668
## Max.cor.Y.Time.Poly##rcv#glmnet 0.5000000 0.91892668
## Max.cor.Y.rcv.1X1###glmnet 0.5000000 0.73198211
## Low.cor.X##rcv#glmnet 0.5000000 0.91892668
## All.X##rcv#glmnet 0.5000000 0.91892668
## Final.All.X##rcv#glmnet 0.5000000 0.91892668
## All.X##rcv#glm 0.5000000 0.91892668
## Interact.High.cor.Y##rcv#glmnet 0.5000000 0.91892668
## Max.cor.Y##rcv#rpart 0.5000000 0.91855269
## MFO###myMFO_classfr 0.5000000 0.08107332
## Random###myrandom_classfr 0.4998215 0.08107332
## opt.prob.threshold.fit
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.1
## Low.cor.X##rcv#glmnet 0.1
## All.X##rcv#glmnet 0.1
## Final.All.X##rcv#glmnet 0.1
## All.X##rcv#glm 0.1
## Interact.High.cor.Y##rcv#glmnet 0.1
## Max.cor.Y##rcv#rpart 0.0
## MFO###myMFO_classfr 0.0
## Random###myrandom_classfr 0.0
## opt.prob.threshold.OOB
## Max.cor.Y.Time.Lag##rcv#glmnet 0.1
## Max.cor.Y.Time.Poly##rcv#glmnet 0.1
## Max.cor.Y.rcv.1X1###glmnet 0.1
## Low.cor.X##rcv#glmnet 0.1
## All.X##rcv#glmnet 0.1
## Final.All.X##rcv#glmnet 0.1
## All.X##rcv#glm 0.0
## Interact.High.cor.Y##rcv#glmnet 0.1
## Max.cor.Y##rcv#rpart 0.0
## MFO###myMFO_classfr 0.0
## Random###myrandom_classfr 0.0
## [1] "All.X##rcv#glmnet OOB confusion matrix & accuracy: "
## Prediction
## Reference F T
## F 73925 102168
## T 3797 11739
## err.abs.fit.sum err.abs.OOB.sum
## GAS STATION 663.07463 732.87713
## CommercialVehicle 188.66921 208.64035
## School 106.88868 119.85835
## Government 82.61155 92.19587
## Other 1004.74084 1148.76009
## Entertainment 126.31063 142.96743
## PARKING LOT/GARAGE(NON.RESID.) 2776.35723 3188.55838
## ALLEY 420.97635 481.94756
## Residence 920.76198 1060.26102
## Sidewalk 71.96968 82.59952
## cha 60.51845 71.81180
## STREET 21514.75804 25114.30026
## VACANT LOT/LAND 133.15761 157.09572
## VEHICLE NON-COMMERCIAL 99.27551 114.18044
## err.abs.trn.sum err.abs.new.sum
## GAS STATION 663.07463 732.87713
## CommercialVehicle 188.66921 208.64035
## School 106.88868 119.85835
## Government 82.61155 92.19587
## Other 1004.74084 1148.76009
## Entertainment 126.31063 142.96743
## PARKING LOT/GARAGE(NON.RESID.) 2776.35723 3188.55838
## ALLEY 420.97635 481.94756
## Residence 920.76198 1060.26102
## Sidewalk 71.96968 82.59952
## cha 60.51845 71.81180
## STREET 21514.75804 25114.30026
## VACANT LOT/LAND 133.15761 157.09572
## VEHICLE NON-COMMERCIAL 99.27551 114.18044
## .freqRatio.Fit .freqRatio.OOB
## GAS STATION 0.011016078 0.011016078
## CommercialVehicle 0.003381534 0.003381534
## School 0.002165643 0.002165643
## Government 0.001669893 0.001669893
## Other 0.027167078 0.027167078
## Entertainment 0.003397189 0.003397189
## PARKING LOT/GARAGE(NON.RESID.) 0.077503927 0.077503927
## ALLEY 0.012038888 0.012038888
## Residence 0.030741694 0.030741694
## Sidewalk 0.002416127 0.002416127
## cha 0.002139551 0.002139551
## STREET 0.816958811 0.816958811
## VACANT LOT/LAND 0.005140141 0.005140141
## VEHICLE NON-COMMERCIAL 0.004263447 0.004263447
## .freqRatio.Tst .n.Fit .n.New.F .n.New.T
## GAS STATION 0.011016078 2111 NA 2111
## CommercialVehicle 0.003381534 648 NA 648
## School 0.002165643 415 31 384
## Government 0.001669893 320 7 313
## Other 0.027167078 5206 821 4385
## Entertainment 0.003397189 651 179 472
## PARKING LOT/GARAGE(NON.RESID.) 0.077503927 14852 2722 12130
## ALLEY 0.012038888 2307 468 1839
## Residence 0.030741694 5891 2335 3556
## Sidewalk 0.002416127 463 174 289
## cha 0.002139551 410 92 318
## STREET 0.816958811 156553 69905 86648
## VACANT LOT/LAND 0.005140141 985 374 611
## VEHICLE NON-COMMERCIAL 0.004263447 817 614 203
## .n.OOB .n.Trn.F .n.Trn.T .n.Tst .n.fit
## GAS STATION 2111 1672 439 2111 2111
## CommercialVehicle 648 522 126 648 648
## School 415 347 68 415 415
## Government 320 266 54 320 320
## Other 5206 4614 592 5206 5206
## Entertainment 651 572 79 651 651
## PARKING LOT/GARAGE(NON.RESID.) 14852 13249 1603 14852 14852
## ALLEY 2307 2058 249 2307 2307
## Residence 5891 5358 533 5891 5891
## Sidewalk 463 420 43 463 463
## cha 410 381 29 410 410
## STREET 156553 144958 11595 156553 156553
## VACANT LOT/LAND 985 918 67 985 985
## VEHICLE NON-COMMERCIAL 817 758 59 817 817
## .n.new .n.trn err.abs.OOB.mean
## GAS STATION 2111 2111 0.3471706
## CommercialVehicle 648 648 0.3219758
## School 415 415 0.2888153
## Government 320 320 0.2881121
## Other 5206 5206 0.2206608
## Entertainment 651 651 0.2196120
## PARKING LOT/GARAGE(NON.RESID.) 14852 14852 0.2146888
## ALLEY 2307 2307 0.2089066
## Residence 5891 5891 0.1799798
## Sidewalk 463 463 0.1784007
## cha 410 410 0.1751507
## STREET 156553 156553 0.1604204
## VACANT LOT/LAND 985 985 0.1594880
## VEHICLE NON-COMMERCIAL 817 817 0.1397557
## err.abs.fit.mean err.abs.new.mean
## GAS STATION 0.3141045 0.3471706
## CommercialVehicle 0.2911562 0.3219758
## School 0.2575631 0.2888153
## Government 0.2581611 0.2881121
## Other 0.1929967 0.2206608
## Entertainment 0.1940255 0.2196120
## PARKING LOT/GARAGE(NON.RESID.) 0.1869349 0.2146888
## ALLEY 0.1824778 0.2089066
## Residence 0.1562998 0.1799798
## Sidewalk 0.1554421 0.1784007
## cha 0.1476060 0.1751507
## STREET 0.1374280 0.1604204
## VACANT LOT/LAND 0.1351854 0.1594880
## VEHICLE NON-COMMERCIAL 0.1215122 0.1397557
## err.abs.trn.mean
## GAS STATION 0.3141045
## CommercialVehicle 0.2911562
## School 0.2575631
## Government 0.2581611
## Other 0.1929967
## Entertainment 0.1940255
## PARKING LOT/GARAGE(NON.RESID.) 0.1869349
## ALLEY 0.1824778
## Residence 0.1562998
## Sidewalk 0.1554421
## cha 0.1476060
## STREET 0.1374280
## VACANT LOT/LAND 0.1351854
## VEHICLE NON-COMMERCIAL 0.1215122
## err.abs.fit.sum err.abs.OOB.sum err.abs.trn.sum err.abs.new.sum
## 2.817007e+04 3.271605e+04 2.817007e+04 3.271605e+04
## .freqRatio.Fit .freqRatio.OOB .freqRatio.Tst .n.Fit
## 1.000000e+00 1.000000e+00 1.000000e+00 1.916290e+05
## .n.New.F .n.New.T .n.OOB .n.Trn.F
## NA 1.139070e+05 1.916290e+05 1.760930e+05
## .n.Trn.T .n.Tst .n.fit .n.new
## 1.553600e+04 1.916290e+05 1.916290e+05 1.916290e+05
## .n.trn err.abs.OOB.mean err.abs.fit.mean err.abs.new.mean
## 1.916290e+05 3.103138e+00 2.730893e+00 3.103138e+00
## err.abs.trn.mean
## 2.730893e+00
## [1] "Final.All.X##rcv#glmnet new confusion matrix & accuracy: "
## Prediction
## Reference F T
## F 73925 102168
## T 3797 11739
## [1] "Features Importance for selected models:"
## All.X..rcv.glmnet.imp
## Date.day.minutes.poly.2 100.00000
## Date.day.minutes.poly.3 30.25283
## Date.day.minutes.poly.5 22.16548
## Date.day.minutes.poly.1 11.48563
## [1] "glbObsNew prediction stats:"
##
## F T
## 77722 113907
## label step_major step_minor label_minor bgn
## 22 predict.data.new 10 0 0 5788.618
## 23 display.session.info 11 0 0 6070.360
## end elapsed
## 22 6070.359 281.742
## 23 NA NA
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance by am_fctr appears to be independent.
#{r q1, cache=FALSE}
# print(t.test(subset(cars_df, am_fctr == "automatic")$mpg,
#              subset(cars_df, am_fctr == "manual")$mpg,
#              var.equal=FALSE)$conf)
We reject the null hypothesis, i.e., we have evidence to conclude that am_fctr impacts mpg (95% confidence). Manual transmission is better for miles per gallon than automatic transmission.
## label step_major step_minor label_minor bgn
## 16 fit.models 8 0 0 552.914
## 17 fit.models 8 1 1 3901.073
## 6 extract.features.datetime 3 1 1 128.198
## 22 predict.data.new 10 0 0 5788.618
## 18 fit.models 8 2 2 5330.965
## 21 fit.data.training 9 1 1 5636.298
## 19 fit.models 8 3 3 5530.207
## 7 extract.features.image 3 2 2 416.398
## 1 import.data 1 0 0 11.867
## 15 select.features 7 0 0 504.131
## 2 inspect.data 2 0 0 85.117
## 14 partition.data.training 6 0 0 495.580
## 3 scrub.data 2 1 1 122.159
## 12 manage.missing.data 4 0 0 491.507
## 11 extract.features.end 3 6 6 490.555
## 13 cluster.data 5 0 0 494.682
## 20 fit.data.training 9 0 0 5635.829
## 10 extract.features.string 3 5 5 490.355
## 9 extract.features.text 3 4 4 490.301
## 4 transform.data 2 2 2 128.129
## 8 extract.features.price 3 3 3 490.265
## 5 extract.features 3 0 0 128.173
## end elapsed duration
## 16 3901.072 3348.159 3348.158
## 17 5330.965 1429.892 1429.892
## 6 416.397 288.200 288.199
## 22 6070.359 281.742 281.741
## 18 5530.207 199.242 199.242
## 21 5788.618 152.320 152.320
## 19 5635.828 105.621 105.621
## 7 490.264 73.866 73.866
## 1 85.116 73.249 73.249
## 15 552.914 48.783 48.783
## 2 122.158 37.042 37.041
## 14 504.130 8.550 8.550
## 3 128.129 5.970 5.970
## 12 494.682 3.175 3.175
## 11 491.507 0.952 0.952
## 13 495.580 0.898 0.898
## 20 5636.297 0.468 0.468
## 10 490.554 0.199 0.199
## 9 490.354 0.053 0.053
## 4 128.172 0.044 0.043
## 8 490.300 0.035 0.035
## 5 128.197 0.024 0.024
## [1] "Total Elapsed Time: 6,070.359 secs"
## label step_major step_minor label_minor
## 8 fit.models_0_Low.cor.X 1 7 glmnet
## 3 fit.models_0_Random 1 2 myrandom_classfr
## 7 fit.models_0_Interact.High.cor.Y 1 6 glmnet
## 6 fit.models_0_Max.cor.Y.Time.Lag 1 5 glmnet
## 5 fit.models_0_Max.cor.Y.Time.Poly 1 4 glmnet
## 4 fit.models_0_Max.cor.Y.rcv.*X* 1 3 glmnet
## 2 fit.models_0_MFO 1 1 myMFO_classfr
## 1 fit.models_0_bgn 1 0 setup
## bgn end elapsed duration
## 8 3034.140 3901.059 866.919 866.919
## 3 599.084 1181.463 582.379 582.379
## 7 2466.773 3034.139 567.366 567.366
## 6 2016.029 2466.772 450.744 450.743
## 5 1570.017 2016.028 446.011 446.011
## 4 1181.463 1570.016 388.554 388.553
## 2 553.693 599.083 45.391 45.390
## 1 553.656 553.693 0.037 0.037
## [1] "Total Elapsed Time: 3,901.059 secs"